{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 3.086340371904015, "eval_steps": 500, "global_step": 600000, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0025719503099200123, "grad_norm": 1.539023995399475, "learning_rate": 5e-06, "loss": 10.2802, "step": 500 }, { "epoch": 0.005143900619840025, "grad_norm": 1.201340913772583, "learning_rate": 1e-05, "loss": 9.1163, "step": 1000 }, { "epoch": 0.007715850929760037, "grad_norm": 1.3188607692718506, "learning_rate": 1.5e-05, "loss": 8.454, "step": 1500 }, { "epoch": 0.01028780123968005, "grad_norm": 1.3608899116516113, "learning_rate": 2e-05, "loss": 8.0852, "step": 2000 }, { "epoch": 0.012859751549600063, "grad_norm": 1.5117723941802979, "learning_rate": 2.5e-05, "loss": 7.8255, "step": 2500 }, { "epoch": 0.015431701859520074, "grad_norm": 1.4807263612747192, "learning_rate": 3e-05, "loss": 7.6419, "step": 3000 }, { "epoch": 0.018003652169440085, "grad_norm": 1.9357377290725708, "learning_rate": 3.5e-05, "loss": 7.543, "step": 3500 }, { "epoch": 0.0205756024793601, "grad_norm": 1.4053044319152832, "learning_rate": 4e-05, "loss": 7.4304, "step": 4000 }, { "epoch": 0.023147552789280112, "grad_norm": 1.7819926738739014, "learning_rate": 4.5e-05, "loss": 7.3563, "step": 4500 }, { "epoch": 0.025719503099200125, "grad_norm": 1.5757050514221191, "learning_rate": 5e-05, "loss": 7.2987, "step": 5000 }, { "epoch": 0.028291453409120135, "grad_norm": 1.6142442226409912, "learning_rate": 5.500000000000001e-05, "loss": 7.237, "step": 5500 }, { "epoch": 0.030863403719040148, "grad_norm": 1.4982831478118896, "learning_rate": 6e-05, "loss": 7.1706, "step": 6000 }, { "epoch": 0.03343535402896016, "grad_norm": 2.2230963706970215, "learning_rate": 6.500000000000001e-05, "loss": 7.1074, "step": 6500 }, { "epoch": 0.03600730433888017, "grad_norm": 1.8124334812164307, "learning_rate": 7e-05, "loss": 7.0167, "step": 7000 }, { "epoch": 0.03857925464880019, "grad_norm": 2.228245258331299, "learning_rate": 7.500000000000001e-05, "loss": 6.9312, "step": 7500 }, { "epoch": 0.0411512049587202, "grad_norm": 2.270578145980835, "learning_rate": 8e-05, "loss": 6.8237, "step": 8000 }, { "epoch": 0.04372315526864021, "grad_norm": 2.2809226512908936, "learning_rate": 8.499e-05, "loss": 6.6923, "step": 8500 }, { "epoch": 0.046295105578560224, "grad_norm": 2.0048675537109375, "learning_rate": 8.999000000000001e-05, "loss": 6.552, "step": 9000 }, { "epoch": 0.04886705588848023, "grad_norm": 3.099470853805542, "learning_rate": 9.499e-05, "loss": 6.3139, "step": 9500 }, { "epoch": 0.05143900619840025, "grad_norm": 2.3764562606811523, "learning_rate": 9.999000000000001e-05, "loss": 5.9019, "step": 10000 }, { "epoch": 0.05401095650832026, "grad_norm": 2.565927743911743, "learning_rate": 9.994959595959596e-05, "loss": 5.3657, "step": 10500 }, { "epoch": 0.05658290681824027, "grad_norm": 2.563119649887085, "learning_rate": 9.989909090909091e-05, "loss": 5.0573, "step": 11000 }, { "epoch": 0.059154857128160286, "grad_norm": 2.43448805809021, "learning_rate": 9.984858585858586e-05, "loss": 4.8008, "step": 11500 }, { "epoch": 0.061726807438080296, "grad_norm": 2.7851388454437256, "learning_rate": 9.979808080808082e-05, "loss": 4.5791, "step": 12000 }, { "epoch": 0.06429875774800031, "grad_norm": 3.024442195892334, "learning_rate": 9.974757575757576e-05, "loss": 4.3963, "step": 12500 }, { "epoch": 0.06687070805792032, "grad_norm": 2.799959421157837, "learning_rate": 9.969717171717172e-05, "loss": 4.1974, "step": 13000 }, { "epoch": 0.06944265836784033, "grad_norm": 2.5713512897491455, "learning_rate": 9.964666666666667e-05, "loss": 4.0593, "step": 13500 }, { "epoch": 0.07201460867776034, "grad_norm": 2.840730905532837, "learning_rate": 9.959616161616162e-05, "loss": 3.9324, "step": 14000 }, { "epoch": 0.07458655898768035, "grad_norm": 2.8055996894836426, "learning_rate": 9.954565656565658e-05, "loss": 3.8384, "step": 14500 }, { "epoch": 0.07715850929760038, "grad_norm": 3.108902931213379, "learning_rate": 9.949525252525252e-05, "loss": 3.7442, "step": 15000 }, { "epoch": 0.07973045960752038, "grad_norm": 2.613213539123535, "learning_rate": 9.944474747474748e-05, "loss": 3.6902, "step": 15500 }, { "epoch": 0.0823024099174404, "grad_norm": 3.105239152908325, "learning_rate": 9.939424242424243e-05, "loss": 3.5897, "step": 16000 }, { "epoch": 0.0848743602273604, "grad_norm": 2.4152445793151855, "learning_rate": 9.934373737373737e-05, "loss": 3.5084, "step": 16500 }, { "epoch": 0.08744631053728041, "grad_norm": 2.6828176975250244, "learning_rate": 9.929333333333333e-05, "loss": 3.4708, "step": 17000 }, { "epoch": 0.09001826084720044, "grad_norm": 3.0051541328430176, "learning_rate": 9.92428282828283e-05, "loss": 3.3825, "step": 17500 }, { "epoch": 0.09259021115712045, "grad_norm": 2.8882691860198975, "learning_rate": 9.919232323232324e-05, "loss": 3.3456, "step": 18000 }, { "epoch": 0.09516216146704046, "grad_norm": 2.8528401851654053, "learning_rate": 9.914181818181819e-05, "loss": 3.3129, "step": 18500 }, { "epoch": 0.09773411177696047, "grad_norm": 2.7194721698760986, "learning_rate": 9.909141414141415e-05, "loss": 3.2803, "step": 19000 }, { "epoch": 0.10030606208688048, "grad_norm": 2.450242042541504, "learning_rate": 9.90410101010101e-05, "loss": 3.2348, "step": 19500 }, { "epoch": 0.1028780123968005, "grad_norm": 2.868133068084717, "learning_rate": 9.899050505050505e-05, "loss": 3.1772, "step": 20000 }, { "epoch": 0.10544996270672051, "grad_norm": 2.5029168128967285, "learning_rate": 9.894e-05, "loss": 3.1512, "step": 20500 }, { "epoch": 0.10802191301664052, "grad_norm": 2.388946771621704, "learning_rate": 9.888949494949496e-05, "loss": 3.1072, "step": 21000 }, { "epoch": 0.11059386332656053, "grad_norm": 2.435948371887207, "learning_rate": 9.88389898989899e-05, "loss": 3.1057, "step": 21500 }, { "epoch": 0.11316581363648054, "grad_norm": 2.745619058609009, "learning_rate": 9.878858585858586e-05, "loss": 3.0523, "step": 22000 }, { "epoch": 0.11573776394640055, "grad_norm": 2.6838150024414062, "learning_rate": 9.873808080808081e-05, "loss": 3.023, "step": 22500 }, { "epoch": 0.11830971425632057, "grad_norm": 2.8772313594818115, "learning_rate": 9.868757575757577e-05, "loss": 2.9742, "step": 23000 }, { "epoch": 0.12088166456624058, "grad_norm": 3.1740212440490723, "learning_rate": 9.863707070707072e-05, "loss": 2.9758, "step": 23500 }, { "epoch": 0.12345361487616059, "grad_norm": 3.2220029830932617, "learning_rate": 9.858656565656566e-05, "loss": 2.9479, "step": 24000 }, { "epoch": 0.1260255651860806, "grad_norm": 2.6701834201812744, "learning_rate": 9.853616161616162e-05, "loss": 2.9271, "step": 24500 }, { "epoch": 0.12859751549600063, "grad_norm": 2.7941513061523438, "learning_rate": 9.848565656565657e-05, "loss": 2.9261, "step": 25000 }, { "epoch": 0.13116946580592062, "grad_norm": 2.891564130783081, "learning_rate": 9.843515151515153e-05, "loss": 2.891, "step": 25500 }, { "epoch": 0.13374141611584064, "grad_norm": 2.8883216381073, "learning_rate": 9.838464646464647e-05, "loss": 2.8803, "step": 26000 }, { "epoch": 0.13631336642576067, "grad_norm": 2.4983842372894287, "learning_rate": 9.833424242424243e-05, "loss": 2.8642, "step": 26500 }, { "epoch": 0.13888531673568066, "grad_norm": 2.7084362506866455, "learning_rate": 9.828373737373738e-05, "loss": 2.8195, "step": 27000 }, { "epoch": 0.1414572670456007, "grad_norm": 2.29557204246521, "learning_rate": 9.823323232323233e-05, "loss": 2.814, "step": 27500 }, { "epoch": 0.14402921735552068, "grad_norm": 2.3679752349853516, "learning_rate": 9.818272727272729e-05, "loss": 2.7952, "step": 28000 }, { "epoch": 0.1466011676654407, "grad_norm": 2.6051392555236816, "learning_rate": 9.813232323232325e-05, "loss": 2.7855, "step": 28500 }, { "epoch": 0.1491731179753607, "grad_norm": 3.920278310775757, "learning_rate": 9.808181818181818e-05, "loss": 2.7721, "step": 29000 }, { "epoch": 0.15174506828528073, "grad_norm": 3.2232682704925537, "learning_rate": 9.803131313131314e-05, "loss": 2.7573, "step": 29500 }, { "epoch": 0.15431701859520075, "grad_norm": 2.551081418991089, "learning_rate": 9.798080808080809e-05, "loss": 2.7375, "step": 30000 }, { "epoch": 0.15688896890512075, "grad_norm": 2.425506114959717, "learning_rate": 9.793040404040405e-05, "loss": 2.7166, "step": 30500 }, { "epoch": 0.15946091921504077, "grad_norm": 2.9112095832824707, "learning_rate": 9.7879898989899e-05, "loss": 2.6943, "step": 31000 }, { "epoch": 0.16203286952496077, "grad_norm": 2.865812063217163, "learning_rate": 9.782939393939394e-05, "loss": 2.6932, "step": 31500 }, { "epoch": 0.1646048198348808, "grad_norm": 2.484619379043579, "learning_rate": 9.77788888888889e-05, "loss": 2.6781, "step": 32000 }, { "epoch": 0.1671767701448008, "grad_norm": 2.5136959552764893, "learning_rate": 9.772848484848486e-05, "loss": 2.6624, "step": 32500 }, { "epoch": 0.1697487204547208, "grad_norm": 3.022930860519409, "learning_rate": 9.76779797979798e-05, "loss": 2.6657, "step": 33000 }, { "epoch": 0.17232067076464083, "grad_norm": 2.9088263511657715, "learning_rate": 9.762747474747475e-05, "loss": 2.6475, "step": 33500 }, { "epoch": 0.17489262107456083, "grad_norm": 2.884895086288452, "learning_rate": 9.75769696969697e-05, "loss": 2.6192, "step": 34000 }, { "epoch": 0.17746457138448085, "grad_norm": 2.8403241634368896, "learning_rate": 9.752656565656566e-05, "loss": 2.6216, "step": 34500 }, { "epoch": 0.18003652169440088, "grad_norm": 2.7791824340820312, "learning_rate": 9.747606060606062e-05, "loss": 2.6003, "step": 35000 }, { "epoch": 0.18260847200432087, "grad_norm": 2.736762762069702, "learning_rate": 9.742555555555556e-05, "loss": 2.5981, "step": 35500 }, { "epoch": 0.1851804223142409, "grad_norm": 2.719017744064331, "learning_rate": 9.737505050505051e-05, "loss": 2.5924, "step": 36000 }, { "epoch": 0.1877523726241609, "grad_norm": 3.110269784927368, "learning_rate": 9.732464646464647e-05, "loss": 2.5829, "step": 36500 }, { "epoch": 0.19032432293408091, "grad_norm": 2.6917083263397217, "learning_rate": 9.727414141414141e-05, "loss": 2.569, "step": 37000 }, { "epoch": 0.19289627324400094, "grad_norm": 2.3601632118225098, "learning_rate": 9.722363636363637e-05, "loss": 2.5608, "step": 37500 }, { "epoch": 0.19546822355392093, "grad_norm": 2.266639232635498, "learning_rate": 9.717313131313132e-05, "loss": 2.5411, "step": 38000 }, { "epoch": 0.19804017386384096, "grad_norm": 3.149444818496704, "learning_rate": 9.712262626262627e-05, "loss": 2.5342, "step": 38500 }, { "epoch": 0.20061212417376095, "grad_norm": 2.4720096588134766, "learning_rate": 9.707222222222223e-05, "loss": 2.5222, "step": 39000 }, { "epoch": 0.20318407448368098, "grad_norm": 3.0014114379882812, "learning_rate": 9.702171717171717e-05, "loss": 2.5228, "step": 39500 }, { "epoch": 0.205756024793601, "grad_norm": 3.3219223022460938, "learning_rate": 9.697121212121213e-05, "loss": 2.5232, "step": 40000 }, { "epoch": 0.208327975103521, "grad_norm": 2.2936556339263916, "learning_rate": 9.692070707070708e-05, "loss": 2.5031, "step": 40500 }, { "epoch": 0.21089992541344102, "grad_norm": 2.9339241981506348, "learning_rate": 9.687030303030304e-05, "loss": 2.4811, "step": 41000 }, { "epoch": 0.21347187572336102, "grad_norm": 3.1717493534088135, "learning_rate": 9.681979797979799e-05, "loss": 2.4881, "step": 41500 }, { "epoch": 0.21604382603328104, "grad_norm": 3.3218414783477783, "learning_rate": 9.676929292929293e-05, "loss": 2.4928, "step": 42000 }, { "epoch": 0.21861577634320106, "grad_norm": 2.5804648399353027, "learning_rate": 9.671878787878789e-05, "loss": 2.49, "step": 42500 }, { "epoch": 0.22118772665312106, "grad_norm": 2.6406478881835938, "learning_rate": 9.666838383838385e-05, "loss": 2.4826, "step": 43000 }, { "epoch": 0.22375967696304108, "grad_norm": 2.9224679470062256, "learning_rate": 9.66178787878788e-05, "loss": 2.4601, "step": 43500 }, { "epoch": 0.22633162727296108, "grad_norm": 2.5592384338378906, "learning_rate": 9.656737373737374e-05, "loss": 2.4472, "step": 44000 }, { "epoch": 0.2289035775828811, "grad_norm": 2.8015081882476807, "learning_rate": 9.651686868686869e-05, "loss": 2.4617, "step": 44500 }, { "epoch": 0.2314755278928011, "grad_norm": 3.1833553314208984, "learning_rate": 9.646646464646465e-05, "loss": 2.4373, "step": 45000 }, { "epoch": 0.23404747820272112, "grad_norm": 2.631361961364746, "learning_rate": 9.641595959595961e-05, "loss": 2.4167, "step": 45500 }, { "epoch": 0.23661942851264114, "grad_norm": 2.7147443294525146, "learning_rate": 9.636545454545454e-05, "loss": 2.4266, "step": 46000 }, { "epoch": 0.23919137882256114, "grad_norm": 2.604551315307617, "learning_rate": 9.63149494949495e-05, "loss": 2.4089, "step": 46500 }, { "epoch": 0.24176332913248116, "grad_norm": 2.733030319213867, "learning_rate": 9.626454545454546e-05, "loss": 2.4003, "step": 47000 }, { "epoch": 0.24433527944240116, "grad_norm": 2.895327568054199, "learning_rate": 9.621404040404041e-05, "loss": 2.3862, "step": 47500 }, { "epoch": 0.24690722975232118, "grad_norm": 2.6326534748077393, "learning_rate": 9.616353535353535e-05, "loss": 2.4035, "step": 48000 }, { "epoch": 0.2494791800622412, "grad_norm": 2.8169047832489014, "learning_rate": 9.61130303030303e-05, "loss": 2.3894, "step": 48500 }, { "epoch": 0.2520511303721612, "grad_norm": 2.4093706607818604, "learning_rate": 9.606262626262626e-05, "loss": 2.3925, "step": 49000 }, { "epoch": 0.2546230806820812, "grad_norm": 2.373400926589966, "learning_rate": 9.601212121212122e-05, "loss": 2.385, "step": 49500 }, { "epoch": 0.25719503099200125, "grad_norm": 2.445448160171509, "learning_rate": 9.596161616161617e-05, "loss": 2.3762, "step": 50000 }, { "epoch": 0.2597669813019213, "grad_norm": 2.641312599182129, "learning_rate": 9.591111111111111e-05, "loss": 2.3798, "step": 50500 }, { "epoch": 0.26233893161184124, "grad_norm": 2.40631103515625, "learning_rate": 9.586060606060606e-05, "loss": 2.3656, "step": 51000 }, { "epoch": 0.26491088192176127, "grad_norm": 2.609057664871216, "learning_rate": 9.581020202020202e-05, "loss": 2.365, "step": 51500 }, { "epoch": 0.2674828322316813, "grad_norm": 2.9380626678466797, "learning_rate": 9.575969696969698e-05, "loss": 2.3512, "step": 52000 }, { "epoch": 0.2700547825416013, "grad_norm": 2.5909035205841064, "learning_rate": 9.570919191919193e-05, "loss": 2.3351, "step": 52500 }, { "epoch": 0.27262673285152134, "grad_norm": 2.4578676223754883, "learning_rate": 9.565868686868687e-05, "loss": 2.3516, "step": 53000 }, { "epoch": 0.2751986831614413, "grad_norm": 2.208662748336792, "learning_rate": 9.560828282828283e-05, "loss": 2.3376, "step": 53500 }, { "epoch": 0.27777063347136133, "grad_norm": 2.3018739223480225, "learning_rate": 9.555777777777778e-05, "loss": 2.3327, "step": 54000 }, { "epoch": 0.28034258378128135, "grad_norm": 3.107210159301758, "learning_rate": 9.550727272727274e-05, "loss": 2.3187, "step": 54500 }, { "epoch": 0.2829145340912014, "grad_norm": 2.857588052749634, "learning_rate": 9.545676767676768e-05, "loss": 2.3302, "step": 55000 }, { "epoch": 0.28548648440112134, "grad_norm": 2.459374189376831, "learning_rate": 9.540636363636364e-05, "loss": 2.3125, "step": 55500 }, { "epoch": 0.28805843471104137, "grad_norm": 2.3911349773406982, "learning_rate": 9.535585858585859e-05, "loss": 2.3184, "step": 56000 }, { "epoch": 0.2906303850209614, "grad_norm": 2.443305492401123, "learning_rate": 9.530535353535354e-05, "loss": 2.3133, "step": 56500 }, { "epoch": 0.2932023353308814, "grad_norm": 2.788959503173828, "learning_rate": 9.52548484848485e-05, "loss": 2.3109, "step": 57000 }, { "epoch": 0.29577428564080144, "grad_norm": 2.704943895339966, "learning_rate": 9.520444444444446e-05, "loss": 2.3016, "step": 57500 }, { "epoch": 0.2983462359507214, "grad_norm": 2.6149935722351074, "learning_rate": 9.51539393939394e-05, "loss": 2.2847, "step": 58000 }, { "epoch": 0.30091818626064143, "grad_norm": 2.797826051712036, "learning_rate": 9.510343434343435e-05, "loss": 2.294, "step": 58500 }, { "epoch": 0.30349013657056145, "grad_norm": 2.6312453746795654, "learning_rate": 9.50529292929293e-05, "loss": 2.2853, "step": 59000 }, { "epoch": 0.3060620868804815, "grad_norm": 2.364706039428711, "learning_rate": 9.500242424242425e-05, "loss": 2.274, "step": 59500 }, { "epoch": 0.3086340371904015, "grad_norm": 2.1500983238220215, "learning_rate": 9.495202020202021e-05, "loss": 2.2691, "step": 60000 }, { "epoch": 0.31120598750032147, "grad_norm": 2.4474480152130127, "learning_rate": 9.490151515151515e-05, "loss": 2.2765, "step": 60500 }, { "epoch": 0.3137779378102415, "grad_norm": 2.65130352973938, "learning_rate": 9.48510101010101e-05, "loss": 2.2525, "step": 61000 }, { "epoch": 0.3163498881201615, "grad_norm": 2.6861233711242676, "learning_rate": 9.480050505050505e-05, "loss": 2.2492, "step": 61500 }, { "epoch": 0.31892183843008154, "grad_norm": 3.0400047302246094, "learning_rate": 9.475010101010101e-05, "loss": 2.2565, "step": 62000 }, { "epoch": 0.32149378874000156, "grad_norm": 2.5578489303588867, "learning_rate": 9.469959595959597e-05, "loss": 2.2619, "step": 62500 }, { "epoch": 0.32406573904992153, "grad_norm": 2.904978036880493, "learning_rate": 9.46490909090909e-05, "loss": 2.2284, "step": 63000 }, { "epoch": 0.32663768935984155, "grad_norm": 2.79347562789917, "learning_rate": 9.459858585858586e-05, "loss": 2.2413, "step": 63500 }, { "epoch": 0.3292096396697616, "grad_norm": 2.5674405097961426, "learning_rate": 9.454808080808081e-05, "loss": 2.2501, "step": 64000 }, { "epoch": 0.3317815899796816, "grad_norm": 3.054811716079712, "learning_rate": 9.449767676767677e-05, "loss": 2.2433, "step": 64500 }, { "epoch": 0.3343535402896016, "grad_norm": 2.797732353210449, "learning_rate": 9.444717171717172e-05, "loss": 2.2285, "step": 65000 }, { "epoch": 0.3369254905995216, "grad_norm": 2.077179193496704, "learning_rate": 9.439666666666666e-05, "loss": 2.233, "step": 65500 }, { "epoch": 0.3394974409094416, "grad_norm": 2.7943077087402344, "learning_rate": 9.434616161616162e-05, "loss": 2.2336, "step": 66000 }, { "epoch": 0.34206939121936164, "grad_norm": 2.4715709686279297, "learning_rate": 9.429575757575758e-05, "loss": 2.2161, "step": 66500 }, { "epoch": 0.34464134152928166, "grad_norm": 2.578552484512329, "learning_rate": 9.424525252525253e-05, "loss": 2.2184, "step": 67000 }, { "epoch": 0.3472132918392017, "grad_norm": 2.8192737102508545, "learning_rate": 9.419474747474748e-05, "loss": 2.213, "step": 67500 }, { "epoch": 0.34978524214912166, "grad_norm": 2.719334125518799, "learning_rate": 9.414424242424242e-05, "loss": 2.2029, "step": 68000 }, { "epoch": 0.3523571924590417, "grad_norm": 2.509049892425537, "learning_rate": 9.409373737373738e-05, "loss": 2.2093, "step": 68500 }, { "epoch": 0.3549291427689617, "grad_norm": 2.238666296005249, "learning_rate": 9.404323232323233e-05, "loss": 2.2092, "step": 69000 }, { "epoch": 0.3575010930788817, "grad_norm": 2.2683796882629395, "learning_rate": 9.399272727272727e-05, "loss": 2.2011, "step": 69500 }, { "epoch": 0.36007304338880175, "grad_norm": 2.6098029613494873, "learning_rate": 9.394232323232323e-05, "loss": 2.1919, "step": 70000 }, { "epoch": 0.3626449936987217, "grad_norm": 2.656914234161377, "learning_rate": 9.389181818181818e-05, "loss": 2.2042, "step": 70500 }, { "epoch": 0.36521694400864174, "grad_norm": 2.753380298614502, "learning_rate": 9.384131313131314e-05, "loss": 2.1879, "step": 71000 }, { "epoch": 0.36778889431856177, "grad_norm": 2.4511659145355225, "learning_rate": 9.37909090909091e-05, "loss": 2.1984, "step": 71500 }, { "epoch": 0.3703608446284818, "grad_norm": 2.4932587146759033, "learning_rate": 9.374040404040403e-05, "loss": 2.1822, "step": 72000 }, { "epoch": 0.3729327949384018, "grad_norm": 2.8497045040130615, "learning_rate": 9.368989898989899e-05, "loss": 2.184, "step": 72500 }, { "epoch": 0.3755047452483218, "grad_norm": 2.5217344760894775, "learning_rate": 9.363939393939395e-05, "loss": 2.1765, "step": 73000 }, { "epoch": 0.3780766955582418, "grad_norm": 2.4461801052093506, "learning_rate": 9.35888888888889e-05, "loss": 2.174, "step": 73500 }, { "epoch": 0.38064864586816183, "grad_norm": 2.3911330699920654, "learning_rate": 9.353838383838385e-05, "loss": 2.1655, "step": 74000 }, { "epoch": 0.38322059617808185, "grad_norm": 2.4616994857788086, "learning_rate": 9.348787878787879e-05, "loss": 2.1657, "step": 74500 }, { "epoch": 0.3857925464880019, "grad_norm": 2.8872811794281006, "learning_rate": 9.343737373737375e-05, "loss": 2.1677, "step": 75000 }, { "epoch": 0.38836449679792184, "grad_norm": 2.5439906120300293, "learning_rate": 9.338686868686868e-05, "loss": 2.1727, "step": 75500 }, { "epoch": 0.39093644710784187, "grad_norm": 2.687584638595581, "learning_rate": 9.333636363636364e-05, "loss": 2.1606, "step": 76000 }, { "epoch": 0.3935083974177619, "grad_norm": 2.353545904159546, "learning_rate": 9.328585858585859e-05, "loss": 2.1468, "step": 76500 }, { "epoch": 0.3960803477276819, "grad_norm": 2.3765275478363037, "learning_rate": 9.323545454545455e-05, "loss": 2.1593, "step": 77000 }, { "epoch": 0.39865229803760194, "grad_norm": 2.507904052734375, "learning_rate": 9.31849494949495e-05, "loss": 2.1571, "step": 77500 }, { "epoch": 0.4012242483475219, "grad_norm": 2.3261361122131348, "learning_rate": 9.313444444444444e-05, "loss": 2.1511, "step": 78000 }, { "epoch": 0.40379619865744193, "grad_norm": 2.7640092372894287, "learning_rate": 9.30839393939394e-05, "loss": 2.1538, "step": 78500 }, { "epoch": 0.40636814896736195, "grad_norm": 2.9779064655303955, "learning_rate": 9.303343434343435e-05, "loss": 2.1464, "step": 79000 }, { "epoch": 0.408940099277282, "grad_norm": 2.406595468521118, "learning_rate": 9.298303030303031e-05, "loss": 2.141, "step": 79500 }, { "epoch": 0.411512049587202, "grad_norm": 2.4242331981658936, "learning_rate": 9.293252525252526e-05, "loss": 2.1457, "step": 80000 }, { "epoch": 0.41408399989712197, "grad_norm": 3.289874315261841, "learning_rate": 9.28820202020202e-05, "loss": 2.1143, "step": 80500 }, { "epoch": 0.416655950207042, "grad_norm": 2.4861912727355957, "learning_rate": 9.283151515151516e-05, "loss": 2.1366, "step": 81000 }, { "epoch": 0.419227900516962, "grad_norm": 2.6344797611236572, "learning_rate": 9.278101010101011e-05, "loss": 2.1232, "step": 81500 }, { "epoch": 0.42179985082688204, "grad_norm": 2.6407787799835205, "learning_rate": 9.273060606060607e-05, "loss": 2.1326, "step": 82000 }, { "epoch": 0.42437180113680206, "grad_norm": 2.258902072906494, "learning_rate": 9.268010101010101e-05, "loss": 2.1315, "step": 82500 }, { "epoch": 0.42694375144672203, "grad_norm": 2.644082546234131, "learning_rate": 9.262959595959596e-05, "loss": 2.1234, "step": 83000 }, { "epoch": 0.42951570175664205, "grad_norm": 2.3079209327697754, "learning_rate": 9.257909090909092e-05, "loss": 2.1095, "step": 83500 }, { "epoch": 0.4320876520665621, "grad_norm": 2.5316202640533447, "learning_rate": 9.252858585858585e-05, "loss": 2.1202, "step": 84000 }, { "epoch": 0.4346596023764821, "grad_norm": 2.330894708633423, "learning_rate": 9.247808080808081e-05, "loss": 2.0964, "step": 84500 }, { "epoch": 0.4372315526864021, "grad_norm": 2.368502140045166, "learning_rate": 9.242757575757576e-05, "loss": 2.1111, "step": 85000 }, { "epoch": 0.4398035029963221, "grad_norm": 2.195286512374878, "learning_rate": 9.237707070707072e-05, "loss": 2.1045, "step": 85500 }, { "epoch": 0.4423754533062421, "grad_norm": 2.3127996921539307, "learning_rate": 9.232676767676768e-05, "loss": 2.1115, "step": 86000 }, { "epoch": 0.44494740361616214, "grad_norm": 2.3976833820343018, "learning_rate": 9.227626262626264e-05, "loss": 2.1074, "step": 86500 }, { "epoch": 0.44751935392608216, "grad_norm": 2.5539162158966064, "learning_rate": 9.222575757575757e-05, "loss": 2.0948, "step": 87000 }, { "epoch": 0.4500913042360022, "grad_norm": 2.6165366172790527, "learning_rate": 9.217525252525253e-05, "loss": 2.1028, "step": 87500 }, { "epoch": 0.45266325454592216, "grad_norm": 2.407905101776123, "learning_rate": 9.212484848484849e-05, "loss": 2.0883, "step": 88000 }, { "epoch": 0.4552352048558422, "grad_norm": 2.6928274631500244, "learning_rate": 9.207434343434344e-05, "loss": 2.0961, "step": 88500 }, { "epoch": 0.4578071551657622, "grad_norm": 2.3398540019989014, "learning_rate": 9.20238383838384e-05, "loss": 2.0902, "step": 89000 }, { "epoch": 0.4603791054756822, "grad_norm": 2.5254998207092285, "learning_rate": 9.197333333333333e-05, "loss": 2.0807, "step": 89500 }, { "epoch": 0.4629510557856022, "grad_norm": 2.4837093353271484, "learning_rate": 9.192292929292929e-05, "loss": 2.0843, "step": 90000 }, { "epoch": 0.4655230060955222, "grad_norm": 2.218877077102661, "learning_rate": 9.187242424242425e-05, "loss": 2.0727, "step": 90500 }, { "epoch": 0.46809495640544224, "grad_norm": 2.3560657501220703, "learning_rate": 9.18219191919192e-05, "loss": 2.0905, "step": 91000 }, { "epoch": 0.47066690671536227, "grad_norm": 2.2257909774780273, "learning_rate": 9.177141414141414e-05, "loss": 2.0674, "step": 91500 }, { "epoch": 0.4732388570252823, "grad_norm": 2.5964746475219727, "learning_rate": 9.172090909090909e-05, "loss": 2.0798, "step": 92000 }, { "epoch": 0.47581080733520226, "grad_norm": 3.076167345046997, "learning_rate": 9.167050505050505e-05, "loss": 2.0814, "step": 92500 }, { "epoch": 0.4783827576451223, "grad_norm": 2.514014482498169, "learning_rate": 9.162000000000001e-05, "loss": 2.0795, "step": 93000 }, { "epoch": 0.4809547079550423, "grad_norm": 2.6532745361328125, "learning_rate": 9.156949494949495e-05, "loss": 2.0626, "step": 93500 }, { "epoch": 0.48352665826496233, "grad_norm": 2.583951950073242, "learning_rate": 9.15189898989899e-05, "loss": 2.0759, "step": 94000 }, { "epoch": 0.48609860857488235, "grad_norm": 2.2346367835998535, "learning_rate": 9.146858585858586e-05, "loss": 2.0581, "step": 94500 }, { "epoch": 0.4886705588848023, "grad_norm": 2.8640918731689453, "learning_rate": 9.141818181818182e-05, "loss": 2.0758, "step": 95000 }, { "epoch": 0.49124250919472234, "grad_norm": 2.441415548324585, "learning_rate": 9.136767676767677e-05, "loss": 2.0634, "step": 95500 }, { "epoch": 0.49381445950464237, "grad_norm": 2.3812661170959473, "learning_rate": 9.131717171717173e-05, "loss": 2.0711, "step": 96000 }, { "epoch": 0.4963864098145624, "grad_norm": 2.4360954761505127, "learning_rate": 9.126666666666667e-05, "loss": 2.0472, "step": 96500 }, { "epoch": 0.4989583601244824, "grad_norm": 2.4400837421417236, "learning_rate": 9.121616161616162e-05, "loss": 2.0556, "step": 97000 }, { "epoch": 0.5015303104344024, "grad_norm": 2.2711386680603027, "learning_rate": 9.116565656565656e-05, "loss": 2.0581, "step": 97500 }, { "epoch": 0.5041022607443224, "grad_norm": 2.3467113971710205, "learning_rate": 9.111515151515152e-05, "loss": 2.0642, "step": 98000 }, { "epoch": 0.5066742110542425, "grad_norm": 2.3878612518310547, "learning_rate": 9.106464646464646e-05, "loss": 2.0561, "step": 98500 }, { "epoch": 0.5092461613641625, "grad_norm": 2.205702066421509, "learning_rate": 9.101424242424243e-05, "loss": 2.0618, "step": 99000 }, { "epoch": 0.5118181116740824, "grad_norm": 2.889329195022583, "learning_rate": 9.096373737373738e-05, "loss": 2.0457, "step": 99500 }, { "epoch": 0.5143900619840025, "grad_norm": 2.655266761779785, "learning_rate": 9.091323232323232e-05, "loss": 2.036, "step": 100000 }, { "epoch": 0.5169620122939225, "grad_norm": 2.392587900161743, "learning_rate": 9.086272727272728e-05, "loss": 2.0485, "step": 100500 }, { "epoch": 0.5195339626038425, "grad_norm": 2.4802868366241455, "learning_rate": 9.081242424242424e-05, "loss": 2.0478, "step": 101000 }, { "epoch": 0.5221059129137625, "grad_norm": 2.0449373722076416, "learning_rate": 9.07619191919192e-05, "loss": 2.0337, "step": 101500 }, { "epoch": 0.5246778632236825, "grad_norm": 2.384089946746826, "learning_rate": 9.071141414141415e-05, "loss": 2.0171, "step": 102000 }, { "epoch": 0.5272498135336026, "grad_norm": 2.4087131023406982, "learning_rate": 9.06609090909091e-05, "loss": 2.0412, "step": 102500 }, { "epoch": 0.5298217638435225, "grad_norm": 2.774549961090088, "learning_rate": 9.061040404040404e-05, "loss": 2.0364, "step": 103000 }, { "epoch": 0.5323937141534426, "grad_norm": 2.3224120140075684, "learning_rate": 9.0559898989899e-05, "loss": 2.0317, "step": 103500 }, { "epoch": 0.5349656644633626, "grad_norm": 2.3578784465789795, "learning_rate": 9.050939393939393e-05, "loss": 2.0221, "step": 104000 }, { "epoch": 0.5375376147732825, "grad_norm": 2.2884316444396973, "learning_rate": 9.04588888888889e-05, "loss": 2.0284, "step": 104500 }, { "epoch": 0.5401095650832026, "grad_norm": 2.560002326965332, "learning_rate": 9.040838383838385e-05, "loss": 2.0176, "step": 105000 }, { "epoch": 0.5426815153931226, "grad_norm": 2.5888469219207764, "learning_rate": 9.035787878787879e-05, "loss": 2.0265, "step": 105500 }, { "epoch": 0.5452534657030427, "grad_norm": 2.30547833442688, "learning_rate": 9.030737373737375e-05, "loss": 2.0285, "step": 106000 }, { "epoch": 0.5478254160129626, "grad_norm": 2.7285144329071045, "learning_rate": 9.02569696969697e-05, "loss": 2.0138, "step": 106500 }, { "epoch": 0.5503973663228826, "grad_norm": 2.7011213302612305, "learning_rate": 9.020646464646465e-05, "loss": 2.0173, "step": 107000 }, { "epoch": 0.5529693166328027, "grad_norm": 2.5760533809661865, "learning_rate": 9.01559595959596e-05, "loss": 2.0091, "step": 107500 }, { "epoch": 0.5555412669427227, "grad_norm": 2.6803488731384277, "learning_rate": 9.010545454545454e-05, "loss": 2.0199, "step": 108000 }, { "epoch": 0.5581132172526426, "grad_norm": 2.3978312015533447, "learning_rate": 9.00549494949495e-05, "loss": 2.0109, "step": 108500 }, { "epoch": 0.5606851675625627, "grad_norm": 2.530170202255249, "learning_rate": 9.000454545454546e-05, "loss": 2.0085, "step": 109000 }, { "epoch": 0.5632571178724827, "grad_norm": 2.43276309967041, "learning_rate": 8.995404040404041e-05, "loss": 1.9883, "step": 109500 }, { "epoch": 0.5658290681824028, "grad_norm": 2.404324531555176, "learning_rate": 8.990353535353536e-05, "loss": 1.9948, "step": 110000 }, { "epoch": 0.5684010184923227, "grad_norm": 2.732954740524292, "learning_rate": 8.98530303030303e-05, "loss": 1.993, "step": 110500 }, { "epoch": 0.5709729688022427, "grad_norm": 2.43375563621521, "learning_rate": 8.980262626262626e-05, "loss": 2.0089, "step": 111000 }, { "epoch": 0.5735449191121628, "grad_norm": 2.824575662612915, "learning_rate": 8.975212121212122e-05, "loss": 2.0035, "step": 111500 }, { "epoch": 0.5761168694220827, "grad_norm": 2.329143524169922, "learning_rate": 8.970161616161617e-05, "loss": 1.9987, "step": 112000 }, { "epoch": 0.5786888197320028, "grad_norm": 2.400956392288208, "learning_rate": 8.965111111111112e-05, "loss": 1.9997, "step": 112500 }, { "epoch": 0.5812607700419228, "grad_norm": 2.737149953842163, "learning_rate": 8.960070707070707e-05, "loss": 1.9925, "step": 113000 }, { "epoch": 0.5838327203518427, "grad_norm": 2.3448445796966553, "learning_rate": 8.955020202020202e-05, "loss": 2.0024, "step": 113500 }, { "epoch": 0.5864046706617628, "grad_norm": 2.8817691802978516, "learning_rate": 8.949969696969698e-05, "loss": 1.9896, "step": 114000 }, { "epoch": 0.5889766209716828, "grad_norm": 2.6253979206085205, "learning_rate": 8.944929292929294e-05, "loss": 1.985, "step": 114500 }, { "epoch": 0.5915485712816029, "grad_norm": 2.2400150299072266, "learning_rate": 8.939878787878789e-05, "loss": 1.9898, "step": 115000 }, { "epoch": 0.5941205215915228, "grad_norm": 2.847470760345459, "learning_rate": 8.934828282828283e-05, "loss": 1.9939, "step": 115500 }, { "epoch": 0.5966924719014428, "grad_norm": 3.0489280223846436, "learning_rate": 8.929777777777778e-05, "loss": 1.9968, "step": 116000 }, { "epoch": 0.5992644222113629, "grad_norm": 2.2937958240509033, "learning_rate": 8.924727272727274e-05, "loss": 1.9976, "step": 116500 }, { "epoch": 0.6018363725212829, "grad_norm": 2.5017504692077637, "learning_rate": 8.919676767676767e-05, "loss": 1.9912, "step": 117000 }, { "epoch": 0.6044083228312029, "grad_norm": 2.5178418159484863, "learning_rate": 8.914626262626263e-05, "loss": 1.9854, "step": 117500 }, { "epoch": 0.6069802731411229, "grad_norm": 2.754523515701294, "learning_rate": 8.909575757575758e-05, "loss": 1.9814, "step": 118000 }, { "epoch": 0.6095522234510429, "grad_norm": 2.4813973903656006, "learning_rate": 8.904525252525252e-05, "loss": 1.994, "step": 118500 }, { "epoch": 0.612124173760963, "grad_norm": 2.0074260234832764, "learning_rate": 8.89948484848485e-05, "loss": 1.9778, "step": 119000 }, { "epoch": 0.6146961240708829, "grad_norm": 2.4869885444641113, "learning_rate": 8.894434343434343e-05, "loss": 1.9809, "step": 119500 }, { "epoch": 0.617268074380803, "grad_norm": 2.464909315109253, "learning_rate": 8.889383838383839e-05, "loss": 1.9671, "step": 120000 }, { "epoch": 0.619840024690723, "grad_norm": 2.330047130584717, "learning_rate": 8.884333333333334e-05, "loss": 1.9712, "step": 120500 }, { "epoch": 0.6224119750006429, "grad_norm": 2.894199848175049, "learning_rate": 8.87929292929293e-05, "loss": 1.9747, "step": 121000 }, { "epoch": 0.624983925310563, "grad_norm": 2.962379217147827, "learning_rate": 8.874242424242424e-05, "loss": 1.9689, "step": 121500 }, { "epoch": 0.627555875620483, "grad_norm": 3.0637989044189453, "learning_rate": 8.869191919191919e-05, "loss": 1.967, "step": 122000 }, { "epoch": 0.6301278259304031, "grad_norm": 2.25830078125, "learning_rate": 8.864141414141415e-05, "loss": 1.9635, "step": 122500 }, { "epoch": 0.632699776240323, "grad_norm": 2.3451120853424072, "learning_rate": 8.85909090909091e-05, "loss": 1.9664, "step": 123000 }, { "epoch": 0.635271726550243, "grad_norm": 2.26731538772583, "learning_rate": 8.854050505050506e-05, "loss": 1.9561, "step": 123500 }, { "epoch": 0.6378436768601631, "grad_norm": 2.3904566764831543, "learning_rate": 8.849e-05, "loss": 1.9619, "step": 124000 }, { "epoch": 0.640415627170083, "grad_norm": 2.415607213973999, "learning_rate": 8.843949494949495e-05, "loss": 1.9748, "step": 124500 }, { "epoch": 0.6429875774800031, "grad_norm": 2.9378740787506104, "learning_rate": 8.838898989898991e-05, "loss": 1.9602, "step": 125000 }, { "epoch": 0.6455595277899231, "grad_norm": 2.1163997650146484, "learning_rate": 8.833858585858587e-05, "loss": 1.9498, "step": 125500 }, { "epoch": 0.6481314780998431, "grad_norm": 2.2119147777557373, "learning_rate": 8.828818181818183e-05, "loss": 1.9623, "step": 126000 }, { "epoch": 0.6507034284097631, "grad_norm": 3.078888416290283, "learning_rate": 8.823767676767677e-05, "loss": 1.9501, "step": 126500 }, { "epoch": 0.6532753787196831, "grad_norm": 3.1210856437683105, "learning_rate": 8.818717171717172e-05, "loss": 1.963, "step": 127000 }, { "epoch": 0.6558473290296032, "grad_norm": 2.1710915565490723, "learning_rate": 8.813666666666667e-05, "loss": 1.9418, "step": 127500 }, { "epoch": 0.6584192793395232, "grad_norm": 2.1447181701660156, "learning_rate": 8.808616161616163e-05, "loss": 1.9669, "step": 128000 }, { "epoch": 0.6609912296494431, "grad_norm": 3.214812994003296, "learning_rate": 8.803565656565657e-05, "loss": 1.9356, "step": 128500 }, { "epoch": 0.6635631799593632, "grad_norm": 2.4240269660949707, "learning_rate": 8.798515151515152e-05, "loss": 1.9637, "step": 129000 }, { "epoch": 0.6661351302692832, "grad_norm": 2.5283048152923584, "learning_rate": 8.793474747474748e-05, "loss": 1.948, "step": 129500 }, { "epoch": 0.6687070805792033, "grad_norm": 2.215092182159424, "learning_rate": 8.788424242424242e-05, "loss": 1.9401, "step": 130000 }, { "epoch": 0.6712790308891232, "grad_norm": 2.387033462524414, "learning_rate": 8.783373737373738e-05, "loss": 1.9403, "step": 130500 }, { "epoch": 0.6738509811990432, "grad_norm": 2.3272926807403564, "learning_rate": 8.778323232323232e-05, "loss": 1.9288, "step": 131000 }, { "epoch": 0.6764229315089633, "grad_norm": 2.4151089191436768, "learning_rate": 8.773272727272728e-05, "loss": 1.9528, "step": 131500 }, { "epoch": 0.6789948818188832, "grad_norm": 2.122108221054077, "learning_rate": 8.768222222222222e-05, "loss": 1.9403, "step": 132000 }, { "epoch": 0.6815668321288033, "grad_norm": 2.8606338500976562, "learning_rate": 8.763171717171717e-05, "loss": 1.938, "step": 132500 }, { "epoch": 0.6841387824387233, "grad_norm": 2.832679510116577, "learning_rate": 8.758121212121213e-05, "loss": 1.9498, "step": 133000 }, { "epoch": 0.6867107327486432, "grad_norm": 2.884164571762085, "learning_rate": 8.753080808080808e-05, "loss": 1.9339, "step": 133500 }, { "epoch": 0.6892826830585633, "grad_norm": 2.577549457550049, "learning_rate": 8.748030303030304e-05, "loss": 1.9456, "step": 134000 }, { "epoch": 0.6918546333684833, "grad_norm": 2.42988920211792, "learning_rate": 8.7429797979798e-05, "loss": 1.9448, "step": 134500 }, { "epoch": 0.6944265836784034, "grad_norm": 2.5420963764190674, "learning_rate": 8.737929292929293e-05, "loss": 1.9261, "step": 135000 }, { "epoch": 0.6969985339883233, "grad_norm": 2.6064467430114746, "learning_rate": 8.732888888888889e-05, "loss": 1.9288, "step": 135500 }, { "epoch": 0.6995704842982433, "grad_norm": 2.149203062057495, "learning_rate": 8.727838383838383e-05, "loss": 1.9246, "step": 136000 }, { "epoch": 0.7021424346081634, "grad_norm": 2.064519166946411, "learning_rate": 8.72278787878788e-05, "loss": 1.9379, "step": 136500 }, { "epoch": 0.7047143849180834, "grad_norm": 2.159180164337158, "learning_rate": 8.717737373737374e-05, "loss": 1.9389, "step": 137000 }, { "epoch": 0.7072863352280034, "grad_norm": 2.478998899459839, "learning_rate": 8.71269696969697e-05, "loss": 1.9331, "step": 137500 }, { "epoch": 0.7098582855379234, "grad_norm": 2.2875208854675293, "learning_rate": 8.707646464646465e-05, "loss": 1.9263, "step": 138000 }, { "epoch": 0.7124302358478434, "grad_norm": 2.595557928085327, "learning_rate": 8.70259595959596e-05, "loss": 1.9253, "step": 138500 }, { "epoch": 0.7150021861577635, "grad_norm": 2.872157573699951, "learning_rate": 8.697545454545455e-05, "loss": 1.9149, "step": 139000 }, { "epoch": 0.7175741364676834, "grad_norm": 2.4363973140716553, "learning_rate": 8.69249494949495e-05, "loss": 1.9164, "step": 139500 }, { "epoch": 0.7201460867776035, "grad_norm": 2.8040812015533447, "learning_rate": 8.687444444444445e-05, "loss": 1.9134, "step": 140000 }, { "epoch": 0.7227180370875235, "grad_norm": 2.6890177726745605, "learning_rate": 8.68239393939394e-05, "loss": 1.9223, "step": 140500 }, { "epoch": 0.7252899873974434, "grad_norm": 2.4290647506713867, "learning_rate": 8.677343434343435e-05, "loss": 1.9172, "step": 141000 }, { "epoch": 0.7278619377073635, "grad_norm": 2.398864984512329, "learning_rate": 8.672303030303031e-05, "loss": 1.907, "step": 141500 }, { "epoch": 0.7304338880172835, "grad_norm": 2.4179599285125732, "learning_rate": 8.667252525252526e-05, "loss": 1.9108, "step": 142000 }, { "epoch": 0.7330058383272036, "grad_norm": 2.6131629943847656, "learning_rate": 8.66220202020202e-05, "loss": 1.9067, "step": 142500 }, { "epoch": 0.7355777886371235, "grad_norm": 2.302748203277588, "learning_rate": 8.657161616161616e-05, "loss": 1.9104, "step": 143000 }, { "epoch": 0.7381497389470435, "grad_norm": 2.1994614601135254, "learning_rate": 8.652111111111112e-05, "loss": 1.9173, "step": 143500 }, { "epoch": 0.7407216892569636, "grad_norm": 2.1997227668762207, "learning_rate": 8.647060606060607e-05, "loss": 1.9001, "step": 144000 }, { "epoch": 0.7432936395668835, "grad_norm": 2.480407953262329, "learning_rate": 8.642010101010102e-05, "loss": 1.9153, "step": 144500 }, { "epoch": 0.7458655898768036, "grad_norm": 2.447983503341675, "learning_rate": 8.636959595959596e-05, "loss": 1.9147, "step": 145000 }, { "epoch": 0.7484375401867236, "grad_norm": 2.3080880641937256, "learning_rate": 8.631919191919192e-05, "loss": 1.919, "step": 145500 }, { "epoch": 0.7510094904966436, "grad_norm": 2.5869462490081787, "learning_rate": 8.626868686868688e-05, "loss": 1.9078, "step": 146000 }, { "epoch": 0.7535814408065636, "grad_norm": 2.248598098754883, "learning_rate": 8.621818181818181e-05, "loss": 1.9036, "step": 146500 }, { "epoch": 0.7561533911164836, "grad_norm": 2.336503267288208, "learning_rate": 8.616767676767677e-05, "loss": 1.9049, "step": 147000 }, { "epoch": 0.7587253414264037, "grad_norm": 2.6740052700042725, "learning_rate": 8.611717171717172e-05, "loss": 1.8945, "step": 147500 }, { "epoch": 0.7612972917363237, "grad_norm": 2.3795812129974365, "learning_rate": 8.606676767676768e-05, "loss": 1.8985, "step": 148000 }, { "epoch": 0.7638692420462436, "grad_norm": 2.3991169929504395, "learning_rate": 8.601626262626264e-05, "loss": 1.8997, "step": 148500 }, { "epoch": 0.7664411923561637, "grad_norm": 2.6228420734405518, "learning_rate": 8.596575757575757e-05, "loss": 1.8892, "step": 149000 }, { "epoch": 0.7690131426660837, "grad_norm": 2.6543805599212646, "learning_rate": 8.591525252525253e-05, "loss": 1.9133, "step": 149500 }, { "epoch": 0.7715850929760038, "grad_norm": 2.5980093479156494, "learning_rate": 8.586474747474748e-05, "loss": 1.8999, "step": 150000 }, { "epoch": 0.7741570432859237, "grad_norm": 2.239975690841675, "learning_rate": 8.581434343434344e-05, "loss": 1.9011, "step": 150500 }, { "epoch": 0.7767289935958437, "grad_norm": 2.4112389087677, "learning_rate": 8.576383838383839e-05, "loss": 1.8845, "step": 151000 }, { "epoch": 0.7793009439057638, "grad_norm": 2.379509210586548, "learning_rate": 8.571333333333333e-05, "loss": 1.896, "step": 151500 }, { "epoch": 0.7818728942156837, "grad_norm": 2.4327831268310547, "learning_rate": 8.566282828282829e-05, "loss": 1.8935, "step": 152000 }, { "epoch": 0.7844448445256038, "grad_norm": 2.5598642826080322, "learning_rate": 8.561232323232324e-05, "loss": 1.8996, "step": 152500 }, { "epoch": 0.7870167948355238, "grad_norm": 2.7298407554626465, "learning_rate": 8.556181818181818e-05, "loss": 1.8954, "step": 153000 }, { "epoch": 0.7895887451454437, "grad_norm": 2.6706230640411377, "learning_rate": 8.551131313131313e-05, "loss": 1.8865, "step": 153500 }, { "epoch": 0.7921606954553638, "grad_norm": 2.7836761474609375, "learning_rate": 8.546080808080809e-05, "loss": 1.8922, "step": 154000 }, { "epoch": 0.7947326457652838, "grad_norm": 2.4677138328552246, "learning_rate": 8.541040404040405e-05, "loss": 1.8744, "step": 154500 }, { "epoch": 0.7973045960752039, "grad_norm": 2.629953384399414, "learning_rate": 8.5359898989899e-05, "loss": 1.8801, "step": 155000 }, { "epoch": 0.7998765463851238, "grad_norm": 2.1538336277008057, "learning_rate": 8.530939393939394e-05, "loss": 1.8766, "step": 155500 }, { "epoch": 0.8024484966950438, "grad_norm": 2.37500262260437, "learning_rate": 8.525888888888889e-05, "loss": 1.8827, "step": 156000 }, { "epoch": 0.8050204470049639, "grad_norm": 2.6441307067871094, "learning_rate": 8.520848484848485e-05, "loss": 1.8739, "step": 156500 }, { "epoch": 0.8075923973148839, "grad_norm": 2.8131062984466553, "learning_rate": 8.515797979797981e-05, "loss": 1.8777, "step": 157000 }, { "epoch": 0.8101643476248039, "grad_norm": 2.25876784324646, "learning_rate": 8.510757575757577e-05, "loss": 1.891, "step": 157500 }, { "epoch": 0.8127362979347239, "grad_norm": 2.397202253341675, "learning_rate": 8.50570707070707e-05, "loss": 1.8917, "step": 158000 }, { "epoch": 0.8153082482446439, "grad_norm": 2.5230774879455566, "learning_rate": 8.500656565656566e-05, "loss": 1.9009, "step": 158500 }, { "epoch": 0.817880198554564, "grad_norm": 2.8625664710998535, "learning_rate": 8.495606060606061e-05, "loss": 1.8902, "step": 159000 }, { "epoch": 0.8204521488644839, "grad_norm": 2.3342695236206055, "learning_rate": 8.490555555555557e-05, "loss": 1.8664, "step": 159500 }, { "epoch": 0.823024099174404, "grad_norm": 2.483473777770996, "learning_rate": 8.48550505050505e-05, "loss": 1.8791, "step": 160000 }, { "epoch": 0.825596049484324, "grad_norm": 2.270512342453003, "learning_rate": 8.480454545454546e-05, "loss": 1.8758, "step": 160500 }, { "epoch": 0.8281679997942439, "grad_norm": 2.4790780544281006, "learning_rate": 8.475404040404042e-05, "loss": 1.8816, "step": 161000 }, { "epoch": 0.830739950104164, "grad_norm": 2.4023377895355225, "learning_rate": 8.470363636363637e-05, "loss": 1.8783, "step": 161500 }, { "epoch": 0.833311900414084, "grad_norm": 2.6411328315734863, "learning_rate": 8.465313131313131e-05, "loss": 1.8691, "step": 162000 }, { "epoch": 0.8358838507240041, "grad_norm": 2.2638540267944336, "learning_rate": 8.460262626262627e-05, "loss": 1.8545, "step": 162500 }, { "epoch": 0.838455801033924, "grad_norm": 2.785778522491455, "learning_rate": 8.455212121212122e-05, "loss": 1.8755, "step": 163000 }, { "epoch": 0.841027751343844, "grad_norm": 2.2858121395111084, "learning_rate": 8.450171717171718e-05, "loss": 1.8659, "step": 163500 }, { "epoch": 0.8435997016537641, "grad_norm": 2.7761781215667725, "learning_rate": 8.445121212121212e-05, "loss": 1.8673, "step": 164000 }, { "epoch": 0.846171651963684, "grad_norm": 3.0068702697753906, "learning_rate": 8.440070707070707e-05, "loss": 1.8599, "step": 164500 }, { "epoch": 0.8487436022736041, "grad_norm": 2.3816988468170166, "learning_rate": 8.435020202020203e-05, "loss": 1.8687, "step": 165000 }, { "epoch": 0.8513155525835241, "grad_norm": 2.7806084156036377, "learning_rate": 8.429979797979798e-05, "loss": 1.8633, "step": 165500 }, { "epoch": 0.8538875028934441, "grad_norm": 2.572535753250122, "learning_rate": 8.424929292929294e-05, "loss": 1.8586, "step": 166000 }, { "epoch": 0.8564594532033641, "grad_norm": 2.6891589164733887, "learning_rate": 8.419878787878788e-05, "loss": 1.8829, "step": 166500 }, { "epoch": 0.8590314035132841, "grad_norm": 2.2894322872161865, "learning_rate": 8.414828282828283e-05, "loss": 1.8539, "step": 167000 }, { "epoch": 0.8616033538232042, "grad_norm": 2.343632459640503, "learning_rate": 8.409787878787879e-05, "loss": 1.8492, "step": 167500 }, { "epoch": 0.8641753041331242, "grad_norm": 2.1601314544677734, "learning_rate": 8.404737373737375e-05, "loss": 1.869, "step": 168000 }, { "epoch": 0.8667472544430441, "grad_norm": 2.3659918308258057, "learning_rate": 8.39968686868687e-05, "loss": 1.8586, "step": 168500 }, { "epoch": 0.8693192047529642, "grad_norm": 1.9559909105300903, "learning_rate": 8.394646464646465e-05, "loss": 1.8535, "step": 169000 }, { "epoch": 0.8718911550628842, "grad_norm": 2.3367204666137695, "learning_rate": 8.38959595959596e-05, "loss": 1.8438, "step": 169500 }, { "epoch": 0.8744631053728043, "grad_norm": 2.5470831394195557, "learning_rate": 8.384545454545455e-05, "loss": 1.8715, "step": 170000 }, { "epoch": 0.8770350556827242, "grad_norm": 1.9904810190200806, "learning_rate": 8.379494949494951e-05, "loss": 1.837, "step": 170500 }, { "epoch": 0.8796070059926442, "grad_norm": 2.808014392852783, "learning_rate": 8.374444444444445e-05, "loss": 1.8473, "step": 171000 }, { "epoch": 0.8821789563025643, "grad_norm": 2.3761932849884033, "learning_rate": 8.36939393939394e-05, "loss": 1.8492, "step": 171500 }, { "epoch": 0.8847509066124842, "grad_norm": 2.5445032119750977, "learning_rate": 8.364343434343435e-05, "loss": 1.8537, "step": 172000 }, { "epoch": 0.8873228569224043, "grad_norm": 2.6148016452789307, "learning_rate": 8.35929292929293e-05, "loss": 1.8507, "step": 172500 }, { "epoch": 0.8898948072323243, "grad_norm": 2.4389026165008545, "learning_rate": 8.354242424242424e-05, "loss": 1.8421, "step": 173000 }, { "epoch": 0.8924667575422442, "grad_norm": 2.1091599464416504, "learning_rate": 8.34920202020202e-05, "loss": 1.8543, "step": 173500 }, { "epoch": 0.8950387078521643, "grad_norm": 2.5214107036590576, "learning_rate": 8.344151515151516e-05, "loss": 1.8516, "step": 174000 }, { "epoch": 0.8976106581620843, "grad_norm": 2.6828722953796387, "learning_rate": 8.33910101010101e-05, "loss": 1.8537, "step": 174500 }, { "epoch": 0.9001826084720044, "grad_norm": 2.204803943634033, "learning_rate": 8.334050505050506e-05, "loss": 1.8668, "step": 175000 }, { "epoch": 0.9027545587819243, "grad_norm": 2.917100191116333, "learning_rate": 8.329e-05, "loss": 1.8423, "step": 175500 }, { "epoch": 0.9053265090918443, "grad_norm": 2.2125914096832275, "learning_rate": 8.323959595959596e-05, "loss": 1.8403, "step": 176000 }, { "epoch": 0.9078984594017644, "grad_norm": 2.3068203926086426, "learning_rate": 8.318909090909092e-05, "loss": 1.8499, "step": 176500 }, { "epoch": 0.9104704097116844, "grad_norm": 2.733078956604004, "learning_rate": 8.313868686868688e-05, "loss": 1.8387, "step": 177000 }, { "epoch": 0.9130423600216044, "grad_norm": 2.5091042518615723, "learning_rate": 8.308818181818182e-05, "loss": 1.8362, "step": 177500 }, { "epoch": 0.9156143103315244, "grad_norm": 2.4861273765563965, "learning_rate": 8.303767676767677e-05, "loss": 1.8394, "step": 178000 }, { "epoch": 0.9181862606414444, "grad_norm": 2.519242286682129, "learning_rate": 8.298717171717172e-05, "loss": 1.832, "step": 178500 }, { "epoch": 0.9207582109513645, "grad_norm": 2.075767993927002, "learning_rate": 8.293666666666668e-05, "loss": 1.8362, "step": 179000 }, { "epoch": 0.9233301612612844, "grad_norm": 2.563034772872925, "learning_rate": 8.288616161616162e-05, "loss": 1.8355, "step": 179500 }, { "epoch": 0.9259021115712044, "grad_norm": 2.5027518272399902, "learning_rate": 8.283565656565657e-05, "loss": 1.8337, "step": 180000 }, { "epoch": 0.9284740618811245, "grad_norm": 2.341482162475586, "learning_rate": 8.278525252525253e-05, "loss": 1.8452, "step": 180500 }, { "epoch": 0.9310460121910444, "grad_norm": 2.5052967071533203, "learning_rate": 8.273474747474747e-05, "loss": 1.8337, "step": 181000 }, { "epoch": 0.9336179625009645, "grad_norm": 2.9151535034179688, "learning_rate": 8.268424242424243e-05, "loss": 1.8323, "step": 181500 }, { "epoch": 0.9361899128108845, "grad_norm": 2.3366811275482178, "learning_rate": 8.263383838383839e-05, "loss": 1.8286, "step": 182000 }, { "epoch": 0.9387618631208045, "grad_norm": 2.044461727142334, "learning_rate": 8.258333333333334e-05, "loss": 1.8345, "step": 182500 }, { "epoch": 0.9413338134307245, "grad_norm": 2.488086223602295, "learning_rate": 8.253282828282829e-05, "loss": 1.8349, "step": 183000 }, { "epoch": 0.9439057637406445, "grad_norm": 2.246419906616211, "learning_rate": 8.248232323232323e-05, "loss": 1.824, "step": 183500 }, { "epoch": 0.9464777140505646, "grad_norm": 2.0991148948669434, "learning_rate": 8.243181818181819e-05, "loss": 1.8322, "step": 184000 }, { "epoch": 0.9490496643604845, "grad_norm": 2.6441781520843506, "learning_rate": 8.238131313131312e-05, "loss": 1.8341, "step": 184500 }, { "epoch": 0.9516216146704045, "grad_norm": 2.344884157180786, "learning_rate": 8.233080808080808e-05, "loss": 1.8177, "step": 185000 }, { "epoch": 0.9541935649803246, "grad_norm": 2.5357608795166016, "learning_rate": 8.228030303030303e-05, "loss": 1.8263, "step": 185500 }, { "epoch": 0.9567655152902446, "grad_norm": 2.7352442741394043, "learning_rate": 8.222979797979799e-05, "loss": 1.8293, "step": 186000 }, { "epoch": 0.9593374656001646, "grad_norm": 2.9389710426330566, "learning_rate": 8.217929292929292e-05, "loss": 1.8141, "step": 186500 }, { "epoch": 0.9619094159100846, "grad_norm": 2.38529634475708, "learning_rate": 8.212878787878788e-05, "loss": 1.813, "step": 187000 }, { "epoch": 0.9644813662200046, "grad_norm": 2.8772764205932617, "learning_rate": 8.207838383838384e-05, "loss": 1.8231, "step": 187500 }, { "epoch": 0.9670533165299247, "grad_norm": 2.1025900840759277, "learning_rate": 8.202787878787879e-05, "loss": 1.8218, "step": 188000 }, { "epoch": 0.9696252668398446, "grad_norm": 2.149860382080078, "learning_rate": 8.197737373737374e-05, "loss": 1.8163, "step": 188500 }, { "epoch": 0.9721972171497647, "grad_norm": 2.2093310356140137, "learning_rate": 8.19268686868687e-05, "loss": 1.8222, "step": 189000 }, { "epoch": 0.9747691674596847, "grad_norm": 2.126584053039551, "learning_rate": 8.187636363636364e-05, "loss": 1.8139, "step": 189500 }, { "epoch": 0.9773411177696046, "grad_norm": 2.6543593406677246, "learning_rate": 8.182585858585859e-05, "loss": 1.8258, "step": 190000 }, { "epoch": 0.9799130680795247, "grad_norm": 3.2399909496307373, "learning_rate": 8.177535353535353e-05, "loss": 1.8066, "step": 190500 }, { "epoch": 0.9824850183894447, "grad_norm": 2.757171392440796, "learning_rate": 8.17249494949495e-05, "loss": 1.8082, "step": 191000 }, { "epoch": 0.9850569686993648, "grad_norm": 2.164072036743164, "learning_rate": 8.167444444444445e-05, "loss": 1.8214, "step": 191500 }, { "epoch": 0.9876289190092847, "grad_norm": 2.501775026321411, "learning_rate": 8.16239393939394e-05, "loss": 1.813, "step": 192000 }, { "epoch": 0.9902008693192047, "grad_norm": 2.7152421474456787, "learning_rate": 8.157343434343435e-05, "loss": 1.8174, "step": 192500 }, { "epoch": 0.9927728196291248, "grad_norm": 2.667201519012451, "learning_rate": 8.15229292929293e-05, "loss": 1.8253, "step": 193000 }, { "epoch": 0.9953447699390447, "grad_norm": 2.656597375869751, "learning_rate": 8.147242424242425e-05, "loss": 1.8091, "step": 193500 }, { "epoch": 0.9979167202489648, "grad_norm": 2.635948896408081, "learning_rate": 8.14219191919192e-05, "loss": 1.8127, "step": 194000 }, { "epoch": 1.000488670558885, "grad_norm": 2.38082218170166, "learning_rate": 8.137141414141415e-05, "loss": 1.8222, "step": 194500 }, { "epoch": 1.0030606208688049, "grad_norm": 3.0616064071655273, "learning_rate": 8.132090909090909e-05, "loss": 1.8212, "step": 195000 }, { "epoch": 1.0056325711787248, "grad_norm": 2.3557846546173096, "learning_rate": 8.127060606060607e-05, "loss": 1.8092, "step": 195500 }, { "epoch": 1.0082045214886448, "grad_norm": 2.4398655891418457, "learning_rate": 8.122010101010101e-05, "loss": 1.8157, "step": 196000 }, { "epoch": 1.0107764717985648, "grad_norm": 2.373342275619507, "learning_rate": 8.116959595959597e-05, "loss": 1.811, "step": 196500 }, { "epoch": 1.013348422108485, "grad_norm": 2.491063356399536, "learning_rate": 8.111909090909092e-05, "loss": 1.8079, "step": 197000 }, { "epoch": 1.015920372418405, "grad_norm": 2.996239185333252, "learning_rate": 8.106858585858586e-05, "loss": 1.8104, "step": 197500 }, { "epoch": 1.018492322728325, "grad_norm": 2.259913921356201, "learning_rate": 8.101818181818182e-05, "loss": 1.8086, "step": 198000 }, { "epoch": 1.0210642730382449, "grad_norm": 2.3475708961486816, "learning_rate": 8.096767676767677e-05, "loss": 1.8044, "step": 198500 }, { "epoch": 1.0236362233481648, "grad_norm": 1.893655776977539, "learning_rate": 8.091717171717173e-05, "loss": 1.8083, "step": 199000 }, { "epoch": 1.026208173658085, "grad_norm": 2.151472806930542, "learning_rate": 8.086666666666666e-05, "loss": 1.8026, "step": 199500 }, { "epoch": 1.028780123968005, "grad_norm": 2.5114681720733643, "learning_rate": 8.081616161616162e-05, "loss": 1.7933, "step": 200000 }, { "epoch": 1.031352074277925, "grad_norm": 2.255035400390625, "learning_rate": 8.076565656565657e-05, "loss": 1.8041, "step": 200500 }, { "epoch": 1.033924024587845, "grad_norm": 2.479146957397461, "learning_rate": 8.071525252525253e-05, "loss": 1.7984, "step": 201000 }, { "epoch": 1.036495974897765, "grad_norm": 2.6387994289398193, "learning_rate": 8.066474747474749e-05, "loss": 1.8026, "step": 201500 }, { "epoch": 1.039067925207685, "grad_norm": 2.15395188331604, "learning_rate": 8.061424242424242e-05, "loss": 1.8088, "step": 202000 }, { "epoch": 1.041639875517605, "grad_norm": 2.761543035507202, "learning_rate": 8.056373737373738e-05, "loss": 1.8023, "step": 202500 }, { "epoch": 1.044211825827525, "grad_norm": 2.5639731884002686, "learning_rate": 8.051333333333334e-05, "loss": 1.8009, "step": 203000 }, { "epoch": 1.046783776137445, "grad_norm": 2.1359119415283203, "learning_rate": 8.046282828282829e-05, "loss": 1.8206, "step": 203500 }, { "epoch": 1.049355726447365, "grad_norm": 2.0918943881988525, "learning_rate": 8.041232323232323e-05, "loss": 1.7956, "step": 204000 }, { "epoch": 1.051927676757285, "grad_norm": 2.1521031856536865, "learning_rate": 8.036181818181818e-05, "loss": 1.8062, "step": 204500 }, { "epoch": 1.0544996270672051, "grad_norm": 2.2172553539276123, "learning_rate": 8.031131313131314e-05, "loss": 1.7936, "step": 205000 }, { "epoch": 1.057071577377125, "grad_norm": 3.1185765266418457, "learning_rate": 8.026080808080809e-05, "loss": 1.7966, "step": 205500 }, { "epoch": 1.059643527687045, "grad_norm": 2.084747314453125, "learning_rate": 8.021030303030303e-05, "loss": 1.7851, "step": 206000 }, { "epoch": 1.062215477996965, "grad_norm": 2.4494941234588623, "learning_rate": 8.015979797979798e-05, "loss": 1.7943, "step": 206500 }, { "epoch": 1.064787428306885, "grad_norm": 2.62510347366333, "learning_rate": 8.010929292929294e-05, "loss": 1.7931, "step": 207000 }, { "epoch": 1.0673593786168052, "grad_norm": 2.6288397312164307, "learning_rate": 8.00588888888889e-05, "loss": 1.7964, "step": 207500 }, { "epoch": 1.0699313289267252, "grad_norm": 2.5375521183013916, "learning_rate": 8.000838383838384e-05, "loss": 1.8035, "step": 208000 }, { "epoch": 1.0725032792366451, "grad_norm": 2.3402857780456543, "learning_rate": 7.995787878787879e-05, "loss": 1.7847, "step": 208500 }, { "epoch": 1.075075229546565, "grad_norm": 2.824528455734253, "learning_rate": 7.990737373737374e-05, "loss": 1.7905, "step": 209000 }, { "epoch": 1.077647179856485, "grad_norm": 2.478386878967285, "learning_rate": 7.98568686868687e-05, "loss": 1.7894, "step": 209500 }, { "epoch": 1.0802191301664053, "grad_norm": 2.576979398727417, "learning_rate": 7.980636363636363e-05, "loss": 1.7866, "step": 210000 }, { "epoch": 1.0827910804763252, "grad_norm": 2.5241525173187256, "learning_rate": 7.975585858585859e-05, "loss": 1.7895, "step": 210500 }, { "epoch": 1.0853630307862452, "grad_norm": 2.5618913173675537, "learning_rate": 7.970535353535355e-05, "loss": 1.7836, "step": 211000 }, { "epoch": 1.0879349810961652, "grad_norm": 2.0089547634124756, "learning_rate": 7.96549494949495e-05, "loss": 1.7974, "step": 211500 }, { "epoch": 1.0905069314060851, "grad_norm": 2.360208034515381, "learning_rate": 7.960444444444444e-05, "loss": 1.7778, "step": 212000 }, { "epoch": 1.0930788817160053, "grad_norm": 2.1004722118377686, "learning_rate": 7.95539393939394e-05, "loss": 1.7774, "step": 212500 }, { "epoch": 1.0956508320259253, "grad_norm": 2.2082858085632324, "learning_rate": 7.950353535353535e-05, "loss": 1.7809, "step": 213000 }, { "epoch": 1.0982227823358452, "grad_norm": 2.4933605194091797, "learning_rate": 7.945303030303031e-05, "loss": 1.7884, "step": 213500 }, { "epoch": 1.1007947326457652, "grad_norm": 2.1621594429016113, "learning_rate": 7.940252525252527e-05, "loss": 1.7787, "step": 214000 }, { "epoch": 1.1033666829556852, "grad_norm": 2.569934368133545, "learning_rate": 7.93520202020202e-05, "loss": 1.7805, "step": 214500 }, { "epoch": 1.1059386332656054, "grad_norm": 2.512706756591797, "learning_rate": 7.930151515151516e-05, "loss": 1.7897, "step": 215000 }, { "epoch": 1.1085105835755253, "grad_norm": 2.0574967861175537, "learning_rate": 7.92510101010101e-05, "loss": 1.7697, "step": 215500 }, { "epoch": 1.1110825338854453, "grad_norm": 2.4195003509521484, "learning_rate": 7.920060606060607e-05, "loss": 1.7765, "step": 216000 }, { "epoch": 1.1136544841953653, "grad_norm": 2.6895534992218018, "learning_rate": 7.915010101010101e-05, "loss": 1.7842, "step": 216500 }, { "epoch": 1.1162264345052852, "grad_norm": 2.3295652866363525, "learning_rate": 7.909959595959596e-05, "loss": 1.7801, "step": 217000 }, { "epoch": 1.1187983848152054, "grad_norm": 2.4626710414886475, "learning_rate": 7.904909090909092e-05, "loss": 1.7863, "step": 217500 }, { "epoch": 1.1213703351251254, "grad_norm": 2.438185214996338, "learning_rate": 7.899858585858587e-05, "loss": 1.7744, "step": 218000 }, { "epoch": 1.1239422854350454, "grad_norm": 2.2876017093658447, "learning_rate": 7.894808080808081e-05, "loss": 1.7816, "step": 218500 }, { "epoch": 1.1265142357449653, "grad_norm": 2.7953882217407227, "learning_rate": 7.889757575757576e-05, "loss": 1.7838, "step": 219000 }, { "epoch": 1.1290861860548853, "grad_norm": 2.5806899070739746, "learning_rate": 7.884717171717172e-05, "loss": 1.7777, "step": 219500 }, { "epoch": 1.1316581363648055, "grad_norm": 2.28183650970459, "learning_rate": 7.879666666666668e-05, "loss": 1.7922, "step": 220000 }, { "epoch": 1.1342300866747255, "grad_norm": 2.3127825260162354, "learning_rate": 7.874616161616162e-05, "loss": 1.7663, "step": 220500 }, { "epoch": 1.1368020369846454, "grad_norm": 2.4055662155151367, "learning_rate": 7.869565656565657e-05, "loss": 1.7769, "step": 221000 }, { "epoch": 1.1393739872945654, "grad_norm": 2.1033191680908203, "learning_rate": 7.864515151515152e-05, "loss": 1.7832, "step": 221500 }, { "epoch": 1.1419459376044854, "grad_norm": 2.047595500946045, "learning_rate": 7.859474747474748e-05, "loss": 1.7693, "step": 222000 }, { "epoch": 1.1445178879144056, "grad_norm": 2.706106424331665, "learning_rate": 7.854424242424244e-05, "loss": 1.7778, "step": 222500 }, { "epoch": 1.1470898382243255, "grad_norm": 2.076641798019409, "learning_rate": 7.849373737373737e-05, "loss": 1.7678, "step": 223000 }, { "epoch": 1.1496617885342455, "grad_norm": 2.578556537628174, "learning_rate": 7.844323232323233e-05, "loss": 1.7795, "step": 223500 }, { "epoch": 1.1522337388441655, "grad_norm": 2.0416908264160156, "learning_rate": 7.839272727272727e-05, "loss": 1.7665, "step": 224000 }, { "epoch": 1.1548056891540854, "grad_norm": 2.5179026126861572, "learning_rate": 7.834232323232323e-05, "loss": 1.7608, "step": 224500 }, { "epoch": 1.1573776394640056, "grad_norm": 2.2774341106414795, "learning_rate": 7.82918181818182e-05, "loss": 1.7567, "step": 225000 }, { "epoch": 1.1599495897739256, "grad_norm": 2.177483558654785, "learning_rate": 7.824131313131313e-05, "loss": 1.7672, "step": 225500 }, { "epoch": 1.1625215400838456, "grad_norm": 2.516448736190796, "learning_rate": 7.819080808080809e-05, "loss": 1.7576, "step": 226000 }, { "epoch": 1.1650934903937655, "grad_norm": 2.2014214992523193, "learning_rate": 7.814030303030303e-05, "loss": 1.7662, "step": 226500 }, { "epoch": 1.1676654407036855, "grad_norm": 2.2554168701171875, "learning_rate": 7.808979797979798e-05, "loss": 1.7719, "step": 227000 }, { "epoch": 1.1702373910136057, "grad_norm": 2.5222623348236084, "learning_rate": 7.803939393939394e-05, "loss": 1.777, "step": 227500 }, { "epoch": 1.1728093413235257, "grad_norm": 2.1105360984802246, "learning_rate": 7.798888888888889e-05, "loss": 1.7654, "step": 228000 }, { "epoch": 1.1753812916334456, "grad_norm": 2.4991660118103027, "learning_rate": 7.793838383838385e-05, "loss": 1.7622, "step": 228500 }, { "epoch": 1.1779532419433656, "grad_norm": 2.394397258758545, "learning_rate": 7.788787878787879e-05, "loss": 1.763, "step": 229000 }, { "epoch": 1.1805251922532856, "grad_norm": 2.5834200382232666, "learning_rate": 7.783737373737374e-05, "loss": 1.7636, "step": 229500 }, { "epoch": 1.1830971425632058, "grad_norm": 2.1750988960266113, "learning_rate": 7.778686868686868e-05, "loss": 1.7712, "step": 230000 }, { "epoch": 1.1856690928731257, "grad_norm": 2.460362195968628, "learning_rate": 7.773636363636364e-05, "loss": 1.7695, "step": 230500 }, { "epoch": 1.1882410431830457, "grad_norm": 2.492896795272827, "learning_rate": 7.768585858585858e-05, "loss": 1.7628, "step": 231000 }, { "epoch": 1.1908129934929657, "grad_norm": 2.5049636363983154, "learning_rate": 7.763545454545455e-05, "loss": 1.7595, "step": 231500 }, { "epoch": 1.1933849438028856, "grad_norm": 2.638702630996704, "learning_rate": 7.75849494949495e-05, "loss": 1.7716, "step": 232000 }, { "epoch": 1.1959568941128058, "grad_norm": 2.3910155296325684, "learning_rate": 7.753444444444444e-05, "loss": 1.7682, "step": 232500 }, { "epoch": 1.1985288444227258, "grad_norm": 2.247044563293457, "learning_rate": 7.74840404040404e-05, "loss": 1.7625, "step": 233000 }, { "epoch": 1.2011007947326457, "grad_norm": 2.289677858352661, "learning_rate": 7.743353535353536e-05, "loss": 1.7632, "step": 233500 }, { "epoch": 1.2036727450425657, "grad_norm": 2.5424296855926514, "learning_rate": 7.73830303030303e-05, "loss": 1.7672, "step": 234000 }, { "epoch": 1.2062446953524857, "grad_norm": 2.1238250732421875, "learning_rate": 7.733252525252526e-05, "loss": 1.7547, "step": 234500 }, { "epoch": 1.2088166456624059, "grad_norm": 2.2579052448272705, "learning_rate": 7.728202020202022e-05, "loss": 1.7606, "step": 235000 }, { "epoch": 1.2113885959723258, "grad_norm": 2.3846943378448486, "learning_rate": 7.723151515151515e-05, "loss": 1.744, "step": 235500 }, { "epoch": 1.2139605462822458, "grad_norm": 2.23209547996521, "learning_rate": 7.718101010101011e-05, "loss": 1.7643, "step": 236000 }, { "epoch": 1.2165324965921658, "grad_norm": 2.6672916412353516, "learning_rate": 7.713050505050505e-05, "loss": 1.7561, "step": 236500 }, { "epoch": 1.2191044469020857, "grad_norm": 2.5802114009857178, "learning_rate": 7.708010101010101e-05, "loss": 1.7613, "step": 237000 }, { "epoch": 1.221676397212006, "grad_norm": 2.311035633087158, "learning_rate": 7.702959595959597e-05, "loss": 1.7536, "step": 237500 }, { "epoch": 1.224248347521926, "grad_norm": 2.2888970375061035, "learning_rate": 7.697919191919192e-05, "loss": 1.7454, "step": 238000 }, { "epoch": 1.2268202978318459, "grad_norm": 2.203408718109131, "learning_rate": 7.692868686868687e-05, "loss": 1.7496, "step": 238500 }, { "epoch": 1.2293922481417658, "grad_norm": 2.1793553829193115, "learning_rate": 7.687818181818183e-05, "loss": 1.7681, "step": 239000 }, { "epoch": 1.2319641984516858, "grad_norm": 2.3608551025390625, "learning_rate": 7.682767676767677e-05, "loss": 1.7521, "step": 239500 }, { "epoch": 1.234536148761606, "grad_norm": 2.602651834487915, "learning_rate": 7.677717171717172e-05, "loss": 1.7689, "step": 240000 }, { "epoch": 1.237108099071526, "grad_norm": 2.261465311050415, "learning_rate": 7.672666666666667e-05, "loss": 1.7514, "step": 240500 }, { "epoch": 1.239680049381446, "grad_norm": 2.375920057296753, "learning_rate": 7.667616161616162e-05, "loss": 1.7579, "step": 241000 }, { "epoch": 1.242251999691366, "grad_norm": 2.47737979888916, "learning_rate": 7.662575757575758e-05, "loss": 1.7576, "step": 241500 }, { "epoch": 1.2448239500012859, "grad_norm": 2.7517123222351074, "learning_rate": 7.657525252525253e-05, "loss": 1.7527, "step": 242000 }, { "epoch": 1.247395900311206, "grad_norm": 2.765855073928833, "learning_rate": 7.652474747474748e-05, "loss": 1.7442, "step": 242500 }, { "epoch": 1.249967850621126, "grad_norm": 2.3727500438690186, "learning_rate": 7.647424242424242e-05, "loss": 1.7513, "step": 243000 }, { "epoch": 1.252539800931046, "grad_norm": 2.3826792240142822, "learning_rate": 7.642373737373738e-05, "loss": 1.7539, "step": 243500 }, { "epoch": 1.255111751240966, "grad_norm": 2.1369845867156982, "learning_rate": 7.637323232323233e-05, "loss": 1.7457, "step": 244000 }, { "epoch": 1.257683701550886, "grad_norm": 2.8363897800445557, "learning_rate": 7.632272727272728e-05, "loss": 1.7489, "step": 244500 }, { "epoch": 1.2602556518608061, "grad_norm": 2.043923854827881, "learning_rate": 7.627232323232324e-05, "loss": 1.7399, "step": 245000 }, { "epoch": 1.262827602170726, "grad_norm": 2.7618696689605713, "learning_rate": 7.622181818181818e-05, "loss": 1.7444, "step": 245500 }, { "epoch": 1.265399552480646, "grad_norm": 2.689225435256958, "learning_rate": 7.617131313131314e-05, "loss": 1.7489, "step": 246000 }, { "epoch": 1.267971502790566, "grad_norm": 2.448422908782959, "learning_rate": 7.612080808080807e-05, "loss": 1.7428, "step": 246500 }, { "epoch": 1.270543453100486, "grad_norm": 2.5466957092285156, "learning_rate": 7.607030303030303e-05, "loss": 1.7463, "step": 247000 }, { "epoch": 1.2731154034104062, "grad_norm": 2.244110107421875, "learning_rate": 7.6019898989899e-05, "loss": 1.7467, "step": 247500 }, { "epoch": 1.2756873537203262, "grad_norm": 2.1423609256744385, "learning_rate": 7.596939393939394e-05, "loss": 1.7502, "step": 248000 }, { "epoch": 1.2782593040302461, "grad_norm": 2.408640146255493, "learning_rate": 7.59188888888889e-05, "loss": 1.7405, "step": 248500 }, { "epoch": 1.280831254340166, "grad_norm": 2.5381617546081543, "learning_rate": 7.586838383838383e-05, "loss": 1.7383, "step": 249000 }, { "epoch": 1.283403204650086, "grad_norm": 2.206977128982544, "learning_rate": 7.581787878787879e-05, "loss": 1.7377, "step": 249500 }, { "epoch": 1.2859751549600063, "grad_norm": 2.2149858474731445, "learning_rate": 7.576737373737374e-05, "loss": 1.7362, "step": 250000 }, { "epoch": 1.2885471052699262, "grad_norm": 2.614354372024536, "learning_rate": 7.571686868686869e-05, "loss": 1.7517, "step": 250500 }, { "epoch": 1.2911190555798462, "grad_norm": 2.1546077728271484, "learning_rate": 7.566646464646465e-05, "loss": 1.7281, "step": 251000 }, { "epoch": 1.2936910058897662, "grad_norm": 2.150606632232666, "learning_rate": 7.561595959595959e-05, "loss": 1.7525, "step": 251500 }, { "epoch": 1.2962629561996861, "grad_norm": 2.4622044563293457, "learning_rate": 7.556545454545455e-05, "loss": 1.7407, "step": 252000 }, { "epoch": 1.2988349065096063, "grad_norm": 2.383789300918579, "learning_rate": 7.55149494949495e-05, "loss": 1.7401, "step": 252500 }, { "epoch": 1.3014068568195263, "grad_norm": 2.7778983116149902, "learning_rate": 7.546454545454546e-05, "loss": 1.7298, "step": 253000 }, { "epoch": 1.3039788071294462, "grad_norm": 2.69973087310791, "learning_rate": 7.54140404040404e-05, "loss": 1.7298, "step": 253500 }, { "epoch": 1.3065507574393662, "grad_norm": 2.866455554962158, "learning_rate": 7.536353535353535e-05, "loss": 1.7421, "step": 254000 }, { "epoch": 1.3091227077492862, "grad_norm": 2.307335615158081, "learning_rate": 7.531303030303031e-05, "loss": 1.7427, "step": 254500 }, { "epoch": 1.3116946580592064, "grad_norm": 2.242201089859009, "learning_rate": 7.526252525252526e-05, "loss": 1.7406, "step": 255000 }, { "epoch": 1.3142666083691263, "grad_norm": 2.3447513580322266, "learning_rate": 7.52120202020202e-05, "loss": 1.7219, "step": 255500 }, { "epoch": 1.3168385586790463, "grad_norm": 2.4869656562805176, "learning_rate": 7.516151515151516e-05, "loss": 1.7247, "step": 256000 }, { "epoch": 1.3194105089889663, "grad_norm": 3.0479238033294678, "learning_rate": 7.511101010101011e-05, "loss": 1.7387, "step": 256500 }, { "epoch": 1.3219824592988862, "grad_norm": 2.106835126876831, "learning_rate": 7.506060606060607e-05, "loss": 1.7436, "step": 257000 }, { "epoch": 1.3245544096088064, "grad_norm": 2.6086888313293457, "learning_rate": 7.5010101010101e-05, "loss": 1.7299, "step": 257500 }, { "epoch": 1.3271263599187264, "grad_norm": 2.5068061351776123, "learning_rate": 7.495959595959596e-05, "loss": 1.727, "step": 258000 }, { "epoch": 1.3296983102286464, "grad_norm": 2.0098962783813477, "learning_rate": 7.490909090909092e-05, "loss": 1.7233, "step": 258500 }, { "epoch": 1.3322702605385663, "grad_norm": 2.0728952884674072, "learning_rate": 7.485858585858587e-05, "loss": 1.7053, "step": 259000 }, { "epoch": 1.3348422108484863, "grad_norm": 2.0596702098846436, "learning_rate": 7.480808080808081e-05, "loss": 1.7309, "step": 259500 }, { "epoch": 1.3374141611584065, "grad_norm": 2.2352986335754395, "learning_rate": 7.475757575757576e-05, "loss": 1.7363, "step": 260000 }, { "epoch": 1.3399861114683265, "grad_norm": 2.318910598754883, "learning_rate": 7.470707070707072e-05, "loss": 1.7329, "step": 260500 }, { "epoch": 1.3425580617782464, "grad_norm": 2.536661148071289, "learning_rate": 7.465666666666668e-05, "loss": 1.7263, "step": 261000 }, { "epoch": 1.3451300120881664, "grad_norm": 2.216972827911377, "learning_rate": 7.460616161616161e-05, "loss": 1.7328, "step": 261500 }, { "epoch": 1.3477019623980864, "grad_norm": 2.4291155338287354, "learning_rate": 7.455565656565657e-05, "loss": 1.7299, "step": 262000 }, { "epoch": 1.3502739127080066, "grad_norm": 2.5120067596435547, "learning_rate": 7.450515151515152e-05, "loss": 1.7398, "step": 262500 }, { "epoch": 1.3528458630179265, "grad_norm": 2.61008358001709, "learning_rate": 7.445474747474748e-05, "loss": 1.7333, "step": 263000 }, { "epoch": 1.3554178133278465, "grad_norm": 2.112347364425659, "learning_rate": 7.440424242424244e-05, "loss": 1.7215, "step": 263500 }, { "epoch": 1.3579897636377665, "grad_norm": 2.860222339630127, "learning_rate": 7.435373737373737e-05, "loss": 1.727, "step": 264000 }, { "epoch": 1.3605617139476864, "grad_norm": 2.319789171218872, "learning_rate": 7.430323232323233e-05, "loss": 1.7278, "step": 264500 }, { "epoch": 1.3631336642576066, "grad_norm": 2.808403253555298, "learning_rate": 7.425282828282829e-05, "loss": 1.7404, "step": 265000 }, { "epoch": 1.3657056145675266, "grad_norm": 2.207468271255493, "learning_rate": 7.420232323232324e-05, "loss": 1.7247, "step": 265500 }, { "epoch": 1.3682775648774466, "grad_norm": 3.101154327392578, "learning_rate": 7.415181818181818e-05, "loss": 1.7326, "step": 266000 }, { "epoch": 1.3708495151873665, "grad_norm": 2.5844483375549316, "learning_rate": 7.410131313131313e-05, "loss": 1.7153, "step": 266500 }, { "epoch": 1.3734214654972865, "grad_norm": 2.1961023807525635, "learning_rate": 7.405090909090909e-05, "loss": 1.7174, "step": 267000 }, { "epoch": 1.3759934158072067, "grad_norm": 2.372945785522461, "learning_rate": 7.400050505050505e-05, "loss": 1.728, "step": 267500 }, { "epoch": 1.3785653661171267, "grad_norm": 2.262930154800415, "learning_rate": 7.395000000000001e-05, "loss": 1.7088, "step": 268000 }, { "epoch": 1.3811373164270466, "grad_norm": 2.2142205238342285, "learning_rate": 7.389949494949495e-05, "loss": 1.7111, "step": 268500 }, { "epoch": 1.3837092667369666, "grad_norm": 3.059236526489258, "learning_rate": 7.38489898989899e-05, "loss": 1.7179, "step": 269000 }, { "epoch": 1.3862812170468866, "grad_norm": 2.1427500247955322, "learning_rate": 7.379848484848485e-05, "loss": 1.722, "step": 269500 }, { "epoch": 1.3888531673568068, "grad_norm": 2.4149832725524902, "learning_rate": 7.374808080808081e-05, "loss": 1.7259, "step": 270000 }, { "epoch": 1.3914251176667267, "grad_norm": 2.1872212886810303, "learning_rate": 7.369757575757577e-05, "loss": 1.7188, "step": 270500 }, { "epoch": 1.3939970679766467, "grad_norm": 2.333991289138794, "learning_rate": 7.364707070707071e-05, "loss": 1.7222, "step": 271000 }, { "epoch": 1.3965690182865667, "grad_norm": 2.5313849449157715, "learning_rate": 7.359656565656566e-05, "loss": 1.7184, "step": 271500 }, { "epoch": 1.3991409685964866, "grad_norm": 2.467475175857544, "learning_rate": 7.35460606060606e-05, "loss": 1.7288, "step": 272000 }, { "epoch": 1.4017129189064068, "grad_norm": 2.3604865074157715, "learning_rate": 7.349555555555557e-05, "loss": 1.7194, "step": 272500 }, { "epoch": 1.4042848692163268, "grad_norm": 2.3482818603515625, "learning_rate": 7.34450505050505e-05, "loss": 1.7148, "step": 273000 }, { "epoch": 1.4068568195262467, "grad_norm": 2.384766101837158, "learning_rate": 7.339454545454546e-05, "loss": 1.7046, "step": 273500 }, { "epoch": 1.4094287698361667, "grad_norm": 2.6986968517303467, "learning_rate": 7.334414141414142e-05, "loss": 1.7137, "step": 274000 }, { "epoch": 1.4120007201460867, "grad_norm": 2.383161783218384, "learning_rate": 7.329373737373738e-05, "loss": 1.7206, "step": 274500 }, { "epoch": 1.4145726704560069, "grad_norm": 2.5386579036712646, "learning_rate": 7.324323232323232e-05, "loss": 1.7127, "step": 275000 }, { "epoch": 1.4171446207659268, "grad_norm": 2.8972415924072266, "learning_rate": 7.319272727272728e-05, "loss": 1.7088, "step": 275500 }, { "epoch": 1.4197165710758468, "grad_norm": 2.8067967891693115, "learning_rate": 7.314222222222222e-05, "loss": 1.7177, "step": 276000 }, { "epoch": 1.4222885213857668, "grad_norm": 1.916225552558899, "learning_rate": 7.309171717171718e-05, "loss": 1.7019, "step": 276500 }, { "epoch": 1.4248604716956867, "grad_norm": 3.040851354598999, "learning_rate": 7.304121212121212e-05, "loss": 1.7041, "step": 277000 }, { "epoch": 1.427432422005607, "grad_norm": 2.5603034496307373, "learning_rate": 7.299070707070707e-05, "loss": 1.7071, "step": 277500 }, { "epoch": 1.430004372315527, "grad_norm": 3.5265140533447266, "learning_rate": 7.294030303030304e-05, "loss": 1.711, "step": 278000 }, { "epoch": 1.4325763226254469, "grad_norm": 2.5686593055725098, "learning_rate": 7.2889898989899e-05, "loss": 1.7125, "step": 278500 }, { "epoch": 1.4351482729353668, "grad_norm": 2.419116735458374, "learning_rate": 7.283939393939393e-05, "loss": 1.702, "step": 279000 }, { "epoch": 1.4377202232452868, "grad_norm": 2.6491827964782715, "learning_rate": 7.27888888888889e-05, "loss": 1.7198, "step": 279500 }, { "epoch": 1.440292173555207, "grad_norm": 2.181264638900757, "learning_rate": 7.273838383838384e-05, "loss": 1.7124, "step": 280000 }, { "epoch": 1.442864123865127, "grad_norm": 2.609100580215454, "learning_rate": 7.268787878787879e-05, "loss": 1.727, "step": 280500 }, { "epoch": 1.445436074175047, "grad_norm": 2.866640329360962, "learning_rate": 7.263747474747476e-05, "loss": 1.7117, "step": 281000 }, { "epoch": 1.448008024484967, "grad_norm": 2.657816171646118, "learning_rate": 7.25869696969697e-05, "loss": 1.7248, "step": 281500 }, { "epoch": 1.4505799747948869, "grad_norm": 2.376187801361084, "learning_rate": 7.253646464646465e-05, "loss": 1.7056, "step": 282000 }, { "epoch": 1.453151925104807, "grad_norm": 2.379953622817993, "learning_rate": 7.24859595959596e-05, "loss": 1.7167, "step": 282500 }, { "epoch": 1.455723875414727, "grad_norm": 2.7846200466156006, "learning_rate": 7.243545454545455e-05, "loss": 1.7134, "step": 283000 }, { "epoch": 1.458295825724647, "grad_norm": 2.3728222846984863, "learning_rate": 7.238494949494949e-05, "loss": 1.6974, "step": 283500 }, { "epoch": 1.460867776034567, "grad_norm": 2.185354232788086, "learning_rate": 7.233444444444445e-05, "loss": 1.7095, "step": 284000 }, { "epoch": 1.463439726344487, "grad_norm": 2.393312454223633, "learning_rate": 7.22839393939394e-05, "loss": 1.6992, "step": 284500 }, { "epoch": 1.4660116766544071, "grad_norm": 2.4728591442108154, "learning_rate": 7.223343434343434e-05, "loss": 1.7096, "step": 285000 }, { "epoch": 1.468583626964327, "grad_norm": 2.379149913787842, "learning_rate": 7.21830303030303e-05, "loss": 1.7051, "step": 285500 }, { "epoch": 1.471155577274247, "grad_norm": 2.3946895599365234, "learning_rate": 7.213252525252525e-05, "loss": 1.7051, "step": 286000 }, { "epoch": 1.473727527584167, "grad_norm": 2.4574227333068848, "learning_rate": 7.208202020202021e-05, "loss": 1.7048, "step": 286500 }, { "epoch": 1.476299477894087, "grad_norm": 2.5250046253204346, "learning_rate": 7.203151515151514e-05, "loss": 1.7008, "step": 287000 }, { "epoch": 1.4788714282040072, "grad_norm": 2.5990653038024902, "learning_rate": 7.198111111111112e-05, "loss": 1.6975, "step": 287500 }, { "epoch": 1.4814433785139272, "grad_norm": 2.3256866931915283, "learning_rate": 7.193060606060606e-05, "loss": 1.6982, "step": 288000 }, { "epoch": 1.4840153288238471, "grad_norm": 2.4116110801696777, "learning_rate": 7.188010101010101e-05, "loss": 1.7023, "step": 288500 }, { "epoch": 1.486587279133767, "grad_norm": 2.2912509441375732, "learning_rate": 7.182959595959597e-05, "loss": 1.6999, "step": 289000 }, { "epoch": 1.489159229443687, "grad_norm": 2.7787649631500244, "learning_rate": 7.177909090909092e-05, "loss": 1.6979, "step": 289500 }, { "epoch": 1.4917311797536073, "grad_norm": 2.0487236976623535, "learning_rate": 7.172858585858586e-05, "loss": 1.697, "step": 290000 }, { "epoch": 1.4943031300635272, "grad_norm": 2.3088083267211914, "learning_rate": 7.167808080808082e-05, "loss": 1.6906, "step": 290500 }, { "epoch": 1.4968750803734472, "grad_norm": 2.1930689811706543, "learning_rate": 7.162767676767677e-05, "loss": 1.71, "step": 291000 }, { "epoch": 1.4994470306833672, "grad_norm": 2.6284825801849365, "learning_rate": 7.157717171717171e-05, "loss": 1.704, "step": 291500 }, { "epoch": 1.5020189809932871, "grad_norm": 2.0390841960906982, "learning_rate": 7.152676767676769e-05, "loss": 1.7005, "step": 292000 }, { "epoch": 1.5045909313032073, "grad_norm": 2.472266674041748, "learning_rate": 7.147626262626262e-05, "loss": 1.6911, "step": 292500 }, { "epoch": 1.5071628816131273, "grad_norm": 2.0675249099731445, "learning_rate": 7.142575757575758e-05, "loss": 1.7018, "step": 293000 }, { "epoch": 1.5097348319230472, "grad_norm": 2.693594217300415, "learning_rate": 7.137525252525254e-05, "loss": 1.6849, "step": 293500 }, { "epoch": 1.5123067822329672, "grad_norm": 2.4996039867401123, "learning_rate": 7.132474747474747e-05, "loss": 1.7032, "step": 294000 }, { "epoch": 1.5148787325428872, "grad_norm": 2.3143088817596436, "learning_rate": 7.127424242424243e-05, "loss": 1.702, "step": 294500 }, { "epoch": 1.5174506828528074, "grad_norm": 2.636171340942383, "learning_rate": 7.122373737373738e-05, "loss": 1.6903, "step": 295000 }, { "epoch": 1.5200226331627273, "grad_norm": 2.3447632789611816, "learning_rate": 7.117323232323233e-05, "loss": 1.7032, "step": 295500 }, { "epoch": 1.5225945834726473, "grad_norm": 1.977137565612793, "learning_rate": 7.112272727272727e-05, "loss": 1.6845, "step": 296000 }, { "epoch": 1.5251665337825673, "grad_norm": 2.250196695327759, "learning_rate": 7.107232323232323e-05, "loss": 1.687, "step": 296500 }, { "epoch": 1.5277384840924872, "grad_norm": 2.750044345855713, "learning_rate": 7.102181818181819e-05, "loss": 1.6999, "step": 297000 }, { "epoch": 1.5303104344024074, "grad_norm": 2.4571657180786133, "learning_rate": 7.097131313131314e-05, "loss": 1.6919, "step": 297500 }, { "epoch": 1.5328823847123274, "grad_norm": 2.9166290760040283, "learning_rate": 7.092080808080808e-05, "loss": 1.6857, "step": 298000 }, { "epoch": 1.5354543350222474, "grad_norm": 2.9264209270477295, "learning_rate": 7.087040404040404e-05, "loss": 1.6778, "step": 298500 }, { "epoch": 1.5380262853321673, "grad_norm": 2.910644769668579, "learning_rate": 7.081989898989899e-05, "loss": 1.6869, "step": 299000 }, { "epoch": 1.5405982356420873, "grad_norm": 2.3062753677368164, "learning_rate": 7.076939393939395e-05, "loss": 1.707, "step": 299500 }, { "epoch": 1.5431701859520075, "grad_norm": 2.345658302307129, "learning_rate": 7.07188888888889e-05, "loss": 1.6887, "step": 300000 }, { "epoch": 1.5457421362619275, "grad_norm": 2.5615222454071045, "learning_rate": 7.066838383838384e-05, "loss": 1.6918, "step": 300500 }, { "epoch": 1.5483140865718474, "grad_norm": 2.4387075901031494, "learning_rate": 7.061787878787879e-05, "loss": 1.6967, "step": 301000 }, { "epoch": 1.5508860368817674, "grad_norm": 2.2662642002105713, "learning_rate": 7.056737373737375e-05, "loss": 1.7053, "step": 301500 }, { "epoch": 1.5534579871916874, "grad_norm": 2.526573896408081, "learning_rate": 7.051686868686868e-05, "loss": 1.6866, "step": 302000 }, { "epoch": 1.5560299375016076, "grad_norm": 2.2950527667999268, "learning_rate": 7.046656565656567e-05, "loss": 1.683, "step": 302500 }, { "epoch": 1.5586018878115275, "grad_norm": 2.3456244468688965, "learning_rate": 7.041606060606061e-05, "loss": 1.6864, "step": 303000 }, { "epoch": 1.5611738381214475, "grad_norm": 2.326719284057617, "learning_rate": 7.036555555555556e-05, "loss": 1.6894, "step": 303500 }, { "epoch": 1.5637457884313675, "grad_norm": 2.5892398357391357, "learning_rate": 7.031515151515152e-05, "loss": 1.6853, "step": 304000 }, { "epoch": 1.5663177387412874, "grad_norm": 2.476912260055542, "learning_rate": 7.026464646464647e-05, "loss": 1.6815, "step": 304500 }, { "epoch": 1.5688896890512076, "grad_norm": 2.147064685821533, "learning_rate": 7.021414141414143e-05, "loss": 1.6883, "step": 305000 }, { "epoch": 1.5714616393611276, "grad_norm": 2.761141061782837, "learning_rate": 7.016363636363636e-05, "loss": 1.679, "step": 305500 }, { "epoch": 1.5740335896710476, "grad_norm": 2.316796064376831, "learning_rate": 7.011313131313132e-05, "loss": 1.6925, "step": 306000 }, { "epoch": 1.5766055399809675, "grad_norm": 2.4468626976013184, "learning_rate": 7.006262626262627e-05, "loss": 1.6923, "step": 306500 }, { "epoch": 1.5791774902908875, "grad_norm": 2.4432520866394043, "learning_rate": 7.001212121212121e-05, "loss": 1.6863, "step": 307000 }, { "epoch": 1.5817494406008077, "grad_norm": 2.5849692821502686, "learning_rate": 6.996161616161616e-05, "loss": 1.6831, "step": 307500 }, { "epoch": 1.5843213909107277, "grad_norm": 2.266772985458374, "learning_rate": 6.991111111111112e-05, "loss": 1.6821, "step": 308000 }, { "epoch": 1.5868933412206476, "grad_norm": 2.161853313446045, "learning_rate": 6.986060606060606e-05, "loss": 1.6805, "step": 308500 }, { "epoch": 1.5894652915305676, "grad_norm": 2.5699236392974854, "learning_rate": 6.981010101010101e-05, "loss": 1.6879, "step": 309000 }, { "epoch": 1.5920372418404876, "grad_norm": 2.3673970699310303, "learning_rate": 6.975969696969697e-05, "loss": 1.6765, "step": 309500 }, { "epoch": 1.5946091921504078, "grad_norm": 2.225632667541504, "learning_rate": 6.970919191919192e-05, "loss": 1.6847, "step": 310000 }, { "epoch": 1.5971811424603277, "grad_norm": 2.2272884845733643, "learning_rate": 6.965868686868688e-05, "loss": 1.6769, "step": 310500 }, { "epoch": 1.5997530927702477, "grad_norm": 2.319474458694458, "learning_rate": 6.960818181818182e-05, "loss": 1.6831, "step": 311000 }, { "epoch": 1.6023250430801677, "grad_norm": 2.1718974113464355, "learning_rate": 6.955767676767677e-05, "loss": 1.6638, "step": 311500 }, { "epoch": 1.6048969933900876, "grad_norm": 2.3438401222229004, "learning_rate": 6.950717171717172e-05, "loss": 1.6737, "step": 312000 }, { "epoch": 1.6074689437000078, "grad_norm": 1.9681246280670166, "learning_rate": 6.945666666666668e-05, "loss": 1.6682, "step": 312500 }, { "epoch": 1.6100408940099278, "grad_norm": 2.5999867916107178, "learning_rate": 6.940616161616162e-05, "loss": 1.6861, "step": 313000 }, { "epoch": 1.6126128443198477, "grad_norm": 2.4516825675964355, "learning_rate": 6.935575757575757e-05, "loss": 1.6838, "step": 313500 }, { "epoch": 1.6151847946297677, "grad_norm": 2.1580958366394043, "learning_rate": 6.930525252525253e-05, "loss": 1.6751, "step": 314000 }, { "epoch": 1.6177567449396877, "grad_norm": 2.6636695861816406, "learning_rate": 6.925474747474749e-05, "loss": 1.6781, "step": 314500 }, { "epoch": 1.6203286952496079, "grad_norm": 2.1307785511016846, "learning_rate": 6.920424242424242e-05, "loss": 1.6763, "step": 315000 }, { "epoch": 1.6229006455595278, "grad_norm": 2.4927167892456055, "learning_rate": 6.91538383838384e-05, "loss": 1.6755, "step": 315500 }, { "epoch": 1.6254725958694478, "grad_norm": 1.9655892848968506, "learning_rate": 6.910333333333334e-05, "loss": 1.6839, "step": 316000 }, { "epoch": 1.6280445461793678, "grad_norm": 2.2941057682037354, "learning_rate": 6.905282828282829e-05, "loss": 1.6739, "step": 316500 }, { "epoch": 1.6306164964892877, "grad_norm": 2.4142115116119385, "learning_rate": 6.900232323232325e-05, "loss": 1.6843, "step": 317000 }, { "epoch": 1.633188446799208, "grad_norm": 2.138962745666504, "learning_rate": 6.895181818181818e-05, "loss": 1.6809, "step": 317500 }, { "epoch": 1.635760397109128, "grad_norm": 2.6460509300231934, "learning_rate": 6.890131313131314e-05, "loss": 1.6733, "step": 318000 }, { "epoch": 1.6383323474190479, "grad_norm": 2.2773749828338623, "learning_rate": 6.885080808080809e-05, "loss": 1.6671, "step": 318500 }, { "epoch": 1.6409042977289678, "grad_norm": 2.1762917041778564, "learning_rate": 6.880030303030303e-05, "loss": 1.6649, "step": 319000 }, { "epoch": 1.6434762480388878, "grad_norm": 2.4022064208984375, "learning_rate": 6.874989898989899e-05, "loss": 1.6667, "step": 319500 }, { "epoch": 1.646048198348808, "grad_norm": 2.392923355102539, "learning_rate": 6.869939393939394e-05, "loss": 1.6735, "step": 320000 }, { "epoch": 1.648620148658728, "grad_norm": 2.8275463581085205, "learning_rate": 6.86488888888889e-05, "loss": 1.667, "step": 320500 }, { "epoch": 1.651192098968648, "grad_norm": 2.8365330696105957, "learning_rate": 6.859838383838384e-05, "loss": 1.6766, "step": 321000 }, { "epoch": 1.653764049278568, "grad_norm": 2.6010117530822754, "learning_rate": 6.854787878787879e-05, "loss": 1.6707, "step": 321500 }, { "epoch": 1.6563359995884879, "grad_norm": 2.6623294353485107, "learning_rate": 6.849747474747475e-05, "loss": 1.6676, "step": 322000 }, { "epoch": 1.658907949898408, "grad_norm": 2.760723114013672, "learning_rate": 6.844707070707071e-05, "loss": 1.6634, "step": 322500 }, { "epoch": 1.6614799002083278, "grad_norm": 2.240460157394409, "learning_rate": 6.839656565656566e-05, "loss": 1.6613, "step": 323000 }, { "epoch": 1.664051850518248, "grad_norm": 2.0668253898620605, "learning_rate": 6.834606060606062e-05, "loss": 1.6664, "step": 323500 }, { "epoch": 1.666623800828168, "grad_norm": 2.19256329536438, "learning_rate": 6.829555555555556e-05, "loss": 1.6632, "step": 324000 }, { "epoch": 1.669195751138088, "grad_norm": 2.7215864658355713, "learning_rate": 6.824505050505051e-05, "loss": 1.6662, "step": 324500 }, { "epoch": 1.6717677014480081, "grad_norm": 2.0605878829956055, "learning_rate": 6.819454545454545e-05, "loss": 1.6615, "step": 325000 }, { "epoch": 1.6743396517579279, "grad_norm": 2.1403868198394775, "learning_rate": 6.814404040404041e-05, "loss": 1.6798, "step": 325500 }, { "epoch": 1.676911602067848, "grad_norm": 2.322628974914551, "learning_rate": 6.809353535353535e-05, "loss": 1.6739, "step": 326000 }, { "epoch": 1.679483552377768, "grad_norm": 2.2708230018615723, "learning_rate": 6.804303030303031e-05, "loss": 1.6635, "step": 326500 }, { "epoch": 1.682055502687688, "grad_norm": 2.4940547943115234, "learning_rate": 6.799252525252525e-05, "loss": 1.6679, "step": 327000 }, { "epoch": 1.6846274529976082, "grad_norm": 2.149888038635254, "learning_rate": 6.794202020202021e-05, "loss": 1.6773, "step": 327500 }, { "epoch": 1.687199403307528, "grad_norm": 2.544126272201538, "learning_rate": 6.789151515151515e-05, "loss": 1.6754, "step": 328000 }, { "epoch": 1.6897713536174481, "grad_norm": 2.3829123973846436, "learning_rate": 6.78410101010101e-05, "loss": 1.665, "step": 328500 }, { "epoch": 1.692343303927368, "grad_norm": 2.3244376182556152, "learning_rate": 6.779060606060607e-05, "loss": 1.6724, "step": 329000 }, { "epoch": 1.694915254237288, "grad_norm": 2.288402557373047, "learning_rate": 6.774010101010101e-05, "loss": 1.6534, "step": 329500 }, { "epoch": 1.6974872045472083, "grad_norm": 2.2815768718719482, "learning_rate": 6.768969696969697e-05, "loss": 1.6664, "step": 330000 }, { "epoch": 1.700059154857128, "grad_norm": 2.458909749984741, "learning_rate": 6.763929292929293e-05, "loss": 1.669, "step": 330500 }, { "epoch": 1.7026311051670482, "grad_norm": 2.744945764541626, "learning_rate": 6.758878787878789e-05, "loss": 1.6688, "step": 331000 }, { "epoch": 1.7052030554769682, "grad_norm": 2.7508599758148193, "learning_rate": 6.753828282828282e-05, "loss": 1.6562, "step": 331500 }, { "epoch": 1.7077750057868881, "grad_norm": 2.8219707012176514, "learning_rate": 6.748777777777778e-05, "loss": 1.6542, "step": 332000 }, { "epoch": 1.7103469560968083, "grad_norm": 2.6453421115875244, "learning_rate": 6.743727272727273e-05, "loss": 1.6508, "step": 332500 }, { "epoch": 1.712918906406728, "grad_norm": 2.9267029762268066, "learning_rate": 6.738676767676768e-05, "loss": 1.6559, "step": 333000 }, { "epoch": 1.7154908567166482, "grad_norm": 2.5373966693878174, "learning_rate": 6.733626262626262e-05, "loss": 1.6683, "step": 333500 }, { "epoch": 1.7180628070265682, "grad_norm": 2.3234028816223145, "learning_rate": 6.728575757575758e-05, "loss": 1.655, "step": 334000 }, { "epoch": 1.7206347573364882, "grad_norm": 2.189422845840454, "learning_rate": 6.723525252525253e-05, "loss": 1.6492, "step": 334500 }, { "epoch": 1.7232067076464084, "grad_norm": 2.491847038269043, "learning_rate": 6.718474747474748e-05, "loss": 1.6372, "step": 335000 }, { "epoch": 1.7257786579563281, "grad_norm": 3.009021759033203, "learning_rate": 6.713424242424244e-05, "loss": 1.6486, "step": 335500 }, { "epoch": 1.7283506082662483, "grad_norm": 2.40120005607605, "learning_rate": 6.708373737373738e-05, "loss": 1.651, "step": 336000 }, { "epoch": 1.7309225585761683, "grad_norm": 2.661926746368408, "learning_rate": 6.703323232323233e-05, "loss": 1.663, "step": 336500 }, { "epoch": 1.7334945088860882, "grad_norm": 2.7393829822540283, "learning_rate": 6.698272727272727e-05, "loss": 1.6435, "step": 337000 }, { "epoch": 1.7360664591960084, "grad_norm": 2.4835827350616455, "learning_rate": 6.693222222222223e-05, "loss": 1.6592, "step": 337500 }, { "epoch": 1.7386384095059282, "grad_norm": 2.1766092777252197, "learning_rate": 6.68818181818182e-05, "loss": 1.6624, "step": 338000 }, { "epoch": 1.7412103598158484, "grad_norm": 2.023101329803467, "learning_rate": 6.683131313131314e-05, "loss": 1.6464, "step": 338500 }, { "epoch": 1.7437823101257683, "grad_norm": 2.04542875289917, "learning_rate": 6.678080808080809e-05, "loss": 1.6598, "step": 339000 }, { "epoch": 1.7463542604356883, "grad_norm": 2.204482078552246, "learning_rate": 6.673030303030303e-05, "loss": 1.6412, "step": 339500 }, { "epoch": 1.7489262107456085, "grad_norm": 2.304865598678589, "learning_rate": 6.667979797979799e-05, "loss": 1.6596, "step": 340000 }, { "epoch": 1.7514981610555282, "grad_norm": 2.291093349456787, "learning_rate": 6.662929292929293e-05, "loss": 1.6641, "step": 340500 }, { "epoch": 1.7540701113654484, "grad_norm": 2.821134328842163, "learning_rate": 6.657878787878789e-05, "loss": 1.6525, "step": 341000 }, { "epoch": 1.7566420616753684, "grad_norm": 2.6450328826904297, "learning_rate": 6.652838383838384e-05, "loss": 1.6559, "step": 341500 }, { "epoch": 1.7592140119852884, "grad_norm": 2.166497230529785, "learning_rate": 6.647787878787879e-05, "loss": 1.6591, "step": 342000 }, { "epoch": 1.7617859622952086, "grad_norm": 2.3948822021484375, "learning_rate": 6.642737373737374e-05, "loss": 1.6536, "step": 342500 }, { "epoch": 1.7643579126051283, "grad_norm": 2.443253517150879, "learning_rate": 6.637686868686868e-05, "loss": 1.6489, "step": 343000 }, { "epoch": 1.7669298629150485, "grad_norm": 2.701960802078247, "learning_rate": 6.632636363636364e-05, "loss": 1.6575, "step": 343500 }, { "epoch": 1.7695018132249685, "grad_norm": 2.5581912994384766, "learning_rate": 6.627585858585859e-05, "loss": 1.6558, "step": 344000 }, { "epoch": 1.7720737635348884, "grad_norm": 2.5111706256866455, "learning_rate": 6.622535353535354e-05, "loss": 1.6555, "step": 344500 }, { "epoch": 1.7746457138448086, "grad_norm": 2.4795475006103516, "learning_rate": 6.617484848484848e-05, "loss": 1.6484, "step": 345000 }, { "epoch": 1.7772176641547284, "grad_norm": 2.4566597938537598, "learning_rate": 6.612444444444444e-05, "loss": 1.6532, "step": 345500 }, { "epoch": 1.7797896144646486, "grad_norm": 2.694000005722046, "learning_rate": 6.60739393939394e-05, "loss": 1.6398, "step": 346000 }, { "epoch": 1.7823615647745685, "grad_norm": 2.3903775215148926, "learning_rate": 6.602343434343435e-05, "loss": 1.66, "step": 346500 }, { "epoch": 1.7849335150844885, "grad_norm": 2.5123212337493896, "learning_rate": 6.59729292929293e-05, "loss": 1.6551, "step": 347000 }, { "epoch": 1.7875054653944087, "grad_norm": 2.346447467803955, "learning_rate": 6.592242424242424e-05, "loss": 1.664, "step": 347500 }, { "epoch": 1.7900774157043284, "grad_norm": 2.535243034362793, "learning_rate": 6.58719191919192e-05, "loss": 1.6504, "step": 348000 }, { "epoch": 1.7926493660142486, "grad_norm": 2.1878671646118164, "learning_rate": 6.582141414141413e-05, "loss": 1.6465, "step": 348500 }, { "epoch": 1.7952213163241686, "grad_norm": 1.969903826713562, "learning_rate": 6.57709090909091e-05, "loss": 1.6593, "step": 349000 }, { "epoch": 1.7977932666340886, "grad_norm": 2.7635295391082764, "learning_rate": 6.572050505050505e-05, "loss": 1.6487, "step": 349500 }, { "epoch": 1.8003652169440088, "grad_norm": 2.6183090209960938, "learning_rate": 6.567010101010101e-05, "loss": 1.6512, "step": 350000 }, { "epoch": 1.8029371672539285, "grad_norm": 2.6972358226776123, "learning_rate": 6.561959595959596e-05, "loss": 1.6441, "step": 350500 }, { "epoch": 1.8055091175638487, "grad_norm": 2.986240863800049, "learning_rate": 6.556909090909092e-05, "loss": 1.6467, "step": 351000 }, { "epoch": 1.8080810678737687, "grad_norm": 2.5499420166015625, "learning_rate": 6.551858585858585e-05, "loss": 1.6356, "step": 351500 }, { "epoch": 1.8106530181836886, "grad_norm": 2.5218753814697266, "learning_rate": 6.546808080808081e-05, "loss": 1.6519, "step": 352000 }, { "epoch": 1.8132249684936088, "grad_norm": 2.1634602546691895, "learning_rate": 6.541767676767677e-05, "loss": 1.6409, "step": 352500 }, { "epoch": 1.8157969188035286, "grad_norm": 1.9278182983398438, "learning_rate": 6.536717171717172e-05, "loss": 1.6321, "step": 353000 }, { "epoch": 1.8183688691134487, "grad_norm": 2.819406509399414, "learning_rate": 6.531666666666666e-05, "loss": 1.6444, "step": 353500 }, { "epoch": 1.8209408194233687, "grad_norm": 2.276034116744995, "learning_rate": 6.526616161616161e-05, "loss": 1.6417, "step": 354000 }, { "epoch": 1.8235127697332887, "grad_norm": 1.9764829874038696, "learning_rate": 6.521565656565657e-05, "loss": 1.6306, "step": 354500 }, { "epoch": 1.8260847200432089, "grad_norm": 1.9372199773788452, "learning_rate": 6.516515151515152e-05, "loss": 1.6447, "step": 355000 }, { "epoch": 1.8286566703531286, "grad_norm": 2.0721209049224854, "learning_rate": 6.511464646464646e-05, "loss": 1.6436, "step": 355500 }, { "epoch": 1.8312286206630488, "grad_norm": 2.5440256595611572, "learning_rate": 6.506414141414142e-05, "loss": 1.652, "step": 356000 }, { "epoch": 1.8338005709729688, "grad_norm": 2.4953465461730957, "learning_rate": 6.501373737373738e-05, "loss": 1.6449, "step": 356500 }, { "epoch": 1.8363725212828887, "grad_norm": 2.357142686843872, "learning_rate": 6.496323232323233e-05, "loss": 1.6506, "step": 357000 }, { "epoch": 1.838944471592809, "grad_norm": 2.122255325317383, "learning_rate": 6.491272727272728e-05, "loss": 1.6472, "step": 357500 }, { "epoch": 1.8415164219027287, "grad_norm": 2.392409324645996, "learning_rate": 6.486222222222222e-05, "loss": 1.6321, "step": 358000 }, { "epoch": 1.8440883722126489, "grad_norm": 2.4341251850128174, "learning_rate": 6.481181818181818e-05, "loss": 1.6351, "step": 358500 }, { "epoch": 1.8466603225225688, "grad_norm": 2.6125593185424805, "learning_rate": 6.476131313131314e-05, "loss": 1.632, "step": 359000 }, { "epoch": 1.8492322728324888, "grad_norm": 2.6240487098693848, "learning_rate": 6.471090909090909e-05, "loss": 1.6358, "step": 359500 }, { "epoch": 1.851804223142409, "grad_norm": 2.084984540939331, "learning_rate": 6.466050505050505e-05, "loss": 1.6346, "step": 360000 }, { "epoch": 1.8543761734523287, "grad_norm": 2.0900211334228516, "learning_rate": 6.461e-05, "loss": 1.642, "step": 360500 }, { "epoch": 1.856948123762249, "grad_norm": 2.4863033294677734, "learning_rate": 6.455949494949495e-05, "loss": 1.6405, "step": 361000 }, { "epoch": 1.859520074072169, "grad_norm": 2.3600735664367676, "learning_rate": 6.45089898989899e-05, "loss": 1.6417, "step": 361500 }, { "epoch": 1.8620920243820889, "grad_norm": 2.359057664871216, "learning_rate": 6.445848484848486e-05, "loss": 1.6421, "step": 362000 }, { "epoch": 1.864663974692009, "grad_norm": 2.2243077754974365, "learning_rate": 6.44079797979798e-05, "loss": 1.6285, "step": 362500 }, { "epoch": 1.8672359250019288, "grad_norm": 2.724112033843994, "learning_rate": 6.435747474747475e-05, "loss": 1.6185, "step": 363000 }, { "epoch": 1.869807875311849, "grad_norm": 2.4706525802612305, "learning_rate": 6.43069696969697e-05, "loss": 1.6416, "step": 363500 }, { "epoch": 1.872379825621769, "grad_norm": 2.599776268005371, "learning_rate": 6.425646464646466e-05, "loss": 1.6355, "step": 364000 }, { "epoch": 1.874951775931689, "grad_norm": 2.1543681621551514, "learning_rate": 6.420595959595959e-05, "loss": 1.6432, "step": 364500 }, { "epoch": 1.8775237262416091, "grad_norm": 2.042337417602539, "learning_rate": 6.415545454545455e-05, "loss": 1.6415, "step": 365000 }, { "epoch": 1.8800956765515289, "grad_norm": 2.3360307216644287, "learning_rate": 6.41049494949495e-05, "loss": 1.6353, "step": 365500 }, { "epoch": 1.882667626861449, "grad_norm": 2.5931334495544434, "learning_rate": 6.405454545454546e-05, "loss": 1.6297, "step": 366000 }, { "epoch": 1.885239577171369, "grad_norm": 2.690889835357666, "learning_rate": 6.400404040404042e-05, "loss": 1.6346, "step": 366500 }, { "epoch": 1.887811527481289, "grad_norm": 2.677400827407837, "learning_rate": 6.395353535353535e-05, "loss": 1.6375, "step": 367000 }, { "epoch": 1.8903834777912092, "grad_norm": 2.1778125762939453, "learning_rate": 6.390303030303031e-05, "loss": 1.6205, "step": 367500 }, { "epoch": 1.892955428101129, "grad_norm": 2.61460280418396, "learning_rate": 6.385262626262627e-05, "loss": 1.635, "step": 368000 }, { "epoch": 1.8955273784110491, "grad_norm": 2.425158739089966, "learning_rate": 6.380212121212122e-05, "loss": 1.628, "step": 368500 }, { "epoch": 1.898099328720969, "grad_norm": 2.5733518600463867, "learning_rate": 6.375161616161616e-05, "loss": 1.6284, "step": 369000 }, { "epoch": 1.900671279030889, "grad_norm": 2.4769554138183594, "learning_rate": 6.370111111111111e-05, "loss": 1.6334, "step": 369500 }, { "epoch": 1.9032432293408093, "grad_norm": 2.93058180809021, "learning_rate": 6.365060606060607e-05, "loss": 1.6353, "step": 370000 }, { "epoch": 1.905815179650729, "grad_norm": 2.4658243656158447, "learning_rate": 6.360010101010101e-05, "loss": 1.6291, "step": 370500 }, { "epoch": 1.9083871299606492, "grad_norm": 2.2507095336914062, "learning_rate": 6.354959595959596e-05, "loss": 1.6405, "step": 371000 }, { "epoch": 1.9109590802705692, "grad_norm": 2.3738880157470703, "learning_rate": 6.349909090909091e-05, "loss": 1.6309, "step": 371500 }, { "epoch": 1.9135310305804891, "grad_norm": 2.008300304412842, "learning_rate": 6.344858585858587e-05, "loss": 1.6273, "step": 372000 }, { "epoch": 1.9161029808904093, "grad_norm": 2.2649285793304443, "learning_rate": 6.339818181818183e-05, "loss": 1.6134, "step": 372500 }, { "epoch": 1.918674931200329, "grad_norm": 2.506477117538452, "learning_rate": 6.334767676767677e-05, "loss": 1.6283, "step": 373000 }, { "epoch": 1.9212468815102492, "grad_norm": 2.661729335784912, "learning_rate": 6.329717171717172e-05, "loss": 1.6246, "step": 373500 }, { "epoch": 1.9238188318201692, "grad_norm": 2.6854159832000732, "learning_rate": 6.324666666666667e-05, "loss": 1.6274, "step": 374000 }, { "epoch": 1.9263907821300892, "grad_norm": 2.402884006500244, "learning_rate": 6.319616161616163e-05, "loss": 1.6208, "step": 374500 }, { "epoch": 1.9289627324400094, "grad_norm": 2.1268699169158936, "learning_rate": 6.314565656565656e-05, "loss": 1.6306, "step": 375000 }, { "epoch": 1.9315346827499291, "grad_norm": 2.4067907333374023, "learning_rate": 6.309525252525252e-05, "loss": 1.6071, "step": 375500 }, { "epoch": 1.9341066330598493, "grad_norm": 2.2865099906921387, "learning_rate": 6.304474747474748e-05, "loss": 1.623, "step": 376000 }, { "epoch": 1.9366785833697693, "grad_norm": 2.0596396923065186, "learning_rate": 6.299424242424242e-05, "loss": 1.6288, "step": 376500 }, { "epoch": 1.9392505336796892, "grad_norm": 3.2876358032226562, "learning_rate": 6.294373737373738e-05, "loss": 1.635, "step": 377000 }, { "epoch": 1.9418224839896094, "grad_norm": 2.7481908798217773, "learning_rate": 6.289333333333334e-05, "loss": 1.616, "step": 377500 }, { "epoch": 1.9443944342995292, "grad_norm": 2.604656457901001, "learning_rate": 6.284282828282828e-05, "loss": 1.6255, "step": 378000 }, { "epoch": 1.9469663846094494, "grad_norm": 2.7096235752105713, "learning_rate": 6.279232323232324e-05, "loss": 1.627, "step": 378500 }, { "epoch": 1.9495383349193693, "grad_norm": 2.6425135135650635, "learning_rate": 6.274181818181818e-05, "loss": 1.6308, "step": 379000 }, { "epoch": 1.9521102852292893, "grad_norm": 2.2761101722717285, "learning_rate": 6.269141414141414e-05, "loss": 1.6328, "step": 379500 }, { "epoch": 1.9546822355392095, "grad_norm": 2.5872933864593506, "learning_rate": 6.264090909090909e-05, "loss": 1.6326, "step": 380000 }, { "epoch": 1.9572541858491292, "grad_norm": 2.401745319366455, "learning_rate": 6.259040404040403e-05, "loss": 1.6179, "step": 380500 }, { "epoch": 1.9598261361590494, "grad_norm": 2.335178852081299, "learning_rate": 6.2539898989899e-05, "loss": 1.6225, "step": 381000 }, { "epoch": 1.9623980864689694, "grad_norm": 2.1984500885009766, "learning_rate": 6.248939393939394e-05, "loss": 1.6226, "step": 381500 }, { "epoch": 1.9649700367788894, "grad_norm": 2.53519606590271, "learning_rate": 6.243888888888889e-05, "loss": 1.624, "step": 382000 }, { "epoch": 1.9675419870888096, "grad_norm": 2.1146388053894043, "learning_rate": 6.238838383838385e-05, "loss": 1.6336, "step": 382500 }, { "epoch": 1.9701139373987293, "grad_norm": 2.4738714694976807, "learning_rate": 6.23378787878788e-05, "loss": 1.637, "step": 383000 }, { "epoch": 1.9726858877086495, "grad_norm": 2.67535138130188, "learning_rate": 6.228737373737374e-05, "loss": 1.6248, "step": 383500 }, { "epoch": 1.9752578380185695, "grad_norm": 2.1487460136413574, "learning_rate": 6.223686868686869e-05, "loss": 1.6279, "step": 384000 }, { "epoch": 1.9778297883284894, "grad_norm": 2.0736780166625977, "learning_rate": 6.218636363636365e-05, "loss": 1.6211, "step": 384500 }, { "epoch": 1.9804017386384096, "grad_norm": 2.4359467029571533, "learning_rate": 6.21359595959596e-05, "loss": 1.6341, "step": 385000 }, { "epoch": 1.9829736889483294, "grad_norm": 2.57645845413208, "learning_rate": 6.208545454545455e-05, "loss": 1.6326, "step": 385500 }, { "epoch": 1.9855456392582496, "grad_norm": 2.375304698944092, "learning_rate": 6.20349494949495e-05, "loss": 1.6338, "step": 386000 }, { "epoch": 1.9881175895681695, "grad_norm": 2.1585114002227783, "learning_rate": 6.198444444444444e-05, "loss": 1.6152, "step": 386500 }, { "epoch": 1.9906895398780895, "grad_norm": 2.393204689025879, "learning_rate": 6.19339393939394e-05, "loss": 1.6081, "step": 387000 }, { "epoch": 1.9932614901880097, "grad_norm": 2.543041706085205, "learning_rate": 6.188343434343434e-05, "loss": 1.6122, "step": 387500 }, { "epoch": 1.9958334404979294, "grad_norm": 2.2563304901123047, "learning_rate": 6.18329292929293e-05, "loss": 1.6172, "step": 388000 }, { "epoch": 1.9984053908078496, "grad_norm": 2.4522125720977783, "learning_rate": 6.178242424242424e-05, "loss": 1.6129, "step": 388500 }, { "epoch": 2.00097734111777, "grad_norm": 2.579383611679077, "learning_rate": 6.17320202020202e-05, "loss": 1.6145, "step": 389000 }, { "epoch": 2.0035492914276896, "grad_norm": 2.0245561599731445, "learning_rate": 6.168161616161616e-05, "loss": 1.6293, "step": 389500 }, { "epoch": 2.0061212417376098, "grad_norm": 2.2552874088287354, "learning_rate": 6.163111111111112e-05, "loss": 1.606, "step": 390000 }, { "epoch": 2.0086931920475295, "grad_norm": 2.6959872245788574, "learning_rate": 6.158060606060606e-05, "loss": 1.6271, "step": 390500 }, { "epoch": 2.0112651423574497, "grad_norm": 2.65429949760437, "learning_rate": 6.153010101010102e-05, "loss": 1.6144, "step": 391000 }, { "epoch": 2.01383709266737, "grad_norm": 2.2554690837860107, "learning_rate": 6.147959595959596e-05, "loss": 1.614, "step": 391500 }, { "epoch": 2.0164090429772896, "grad_norm": 2.3167171478271484, "learning_rate": 6.142909090909091e-05, "loss": 1.6128, "step": 392000 }, { "epoch": 2.01898099328721, "grad_norm": 2.5956623554229736, "learning_rate": 6.137858585858585e-05, "loss": 1.6034, "step": 392500 }, { "epoch": 2.0215529435971296, "grad_norm": 2.968029260635376, "learning_rate": 6.132808080808081e-05, "loss": 1.6141, "step": 393000 }, { "epoch": 2.0241248939070497, "grad_norm": 2.7544617652893066, "learning_rate": 6.127767676767677e-05, "loss": 1.6214, "step": 393500 }, { "epoch": 2.02669684421697, "grad_norm": 2.2742416858673096, "learning_rate": 6.122717171717172e-05, "loss": 1.6148, "step": 394000 }, { "epoch": 2.0292687945268897, "grad_norm": 2.220961809158325, "learning_rate": 6.117666666666667e-05, "loss": 1.619, "step": 394500 }, { "epoch": 2.03184074483681, "grad_norm": 2.195733070373535, "learning_rate": 6.112616161616161e-05, "loss": 1.616, "step": 395000 }, { "epoch": 2.0344126951467296, "grad_norm": 2.3462278842926025, "learning_rate": 6.107575757575757e-05, "loss": 1.6129, "step": 395500 }, { "epoch": 2.03698464545665, "grad_norm": 2.70003604888916, "learning_rate": 6.102525252525253e-05, "loss": 1.6043, "step": 396000 }, { "epoch": 2.03955659576657, "grad_norm": 2.403668165206909, "learning_rate": 6.097474747474747e-05, "loss": 1.6184, "step": 396500 }, { "epoch": 2.0421285460764897, "grad_norm": 2.6988089084625244, "learning_rate": 6.0924242424242425e-05, "loss": 1.5978, "step": 397000 }, { "epoch": 2.04470049638641, "grad_norm": 2.7455625534057617, "learning_rate": 6.087383838383839e-05, "loss": 1.6167, "step": 397500 }, { "epoch": 2.0472724466963297, "grad_norm": 2.071835994720459, "learning_rate": 6.082343434343435e-05, "loss": 1.6044, "step": 398000 }, { "epoch": 2.04984439700625, "grad_norm": 2.2983603477478027, "learning_rate": 6.077303030303031e-05, "loss": 1.6122, "step": 398500 }, { "epoch": 2.05241634731617, "grad_norm": 2.077721118927002, "learning_rate": 6.072252525252525e-05, "loss": 1.6174, "step": 399000 }, { "epoch": 2.05498829762609, "grad_norm": 2.942838430404663, "learning_rate": 6.0672121212121216e-05, "loss": 1.6065, "step": 399500 }, { "epoch": 2.05756024793601, "grad_norm": 2.2567286491394043, "learning_rate": 6.062161616161617e-05, "loss": 1.5962, "step": 400000 }, { "epoch": 2.0601321982459297, "grad_norm": 2.995159149169922, "learning_rate": 6.057111111111111e-05, "loss": 1.5997, "step": 400500 }, { "epoch": 2.06270414855585, "grad_norm": 2.48285174369812, "learning_rate": 6.052060606060607e-05, "loss": 1.6092, "step": 401000 }, { "epoch": 2.06527609886577, "grad_norm": 2.27602481842041, "learning_rate": 6.047010101010101e-05, "loss": 1.6085, "step": 401500 }, { "epoch": 2.06784804917569, "grad_norm": 2.100888252258301, "learning_rate": 6.041959595959596e-05, "loss": 1.6074, "step": 402000 }, { "epoch": 2.07041999948561, "grad_norm": 2.656245708465576, "learning_rate": 6.036909090909091e-05, "loss": 1.6188, "step": 402500 }, { "epoch": 2.07299194979553, "grad_norm": 2.497401237487793, "learning_rate": 6.031858585858586e-05, "loss": 1.5999, "step": 403000 }, { "epoch": 2.07556390010545, "grad_norm": 2.740108013153076, "learning_rate": 6.026808080808081e-05, "loss": 1.6138, "step": 403500 }, { "epoch": 2.07813585041537, "grad_norm": 2.2161812782287598, "learning_rate": 6.021757575757576e-05, "loss": 1.5904, "step": 404000 }, { "epoch": 2.08070780072529, "grad_norm": 2.5596768856048584, "learning_rate": 6.016707070707071e-05, "loss": 1.5991, "step": 404500 }, { "epoch": 2.08327975103521, "grad_norm": 2.474024772644043, "learning_rate": 6.011656565656566e-05, "loss": 1.6169, "step": 405000 }, { "epoch": 2.08585170134513, "grad_norm": 2.562389373779297, "learning_rate": 6.0066060606060606e-05, "loss": 1.5961, "step": 405500 }, { "epoch": 2.08842365165505, "grad_norm": 2.165395498275757, "learning_rate": 6.001555555555556e-05, "loss": 1.6032, "step": 406000 }, { "epoch": 2.09099560196497, "grad_norm": 2.6308302879333496, "learning_rate": 5.996505050505051e-05, "loss": 1.6094, "step": 406500 }, { "epoch": 2.09356755227489, "grad_norm": 2.065725564956665, "learning_rate": 5.9914646464646465e-05, "loss": 1.6033, "step": 407000 }, { "epoch": 2.09613950258481, "grad_norm": 3.004451274871826, "learning_rate": 5.986414141414142e-05, "loss": 1.6011, "step": 407500 }, { "epoch": 2.09871145289473, "grad_norm": 2.6577351093292236, "learning_rate": 5.981363636363637e-05, "loss": 1.5985, "step": 408000 }, { "epoch": 2.10128340320465, "grad_norm": 2.4974942207336426, "learning_rate": 5.976313131313132e-05, "loss": 1.6029, "step": 408500 }, { "epoch": 2.10385535351457, "grad_norm": 2.7885189056396484, "learning_rate": 5.971262626262627e-05, "loss": 1.6035, "step": 409000 }, { "epoch": 2.10642730382449, "grad_norm": 2.323251485824585, "learning_rate": 5.966212121212121e-05, "loss": 1.6054, "step": 409500 }, { "epoch": 2.1089992541344103, "grad_norm": 2.666215658187866, "learning_rate": 5.961171717171718e-05, "loss": 1.5987, "step": 410000 }, { "epoch": 2.11157120444433, "grad_norm": 2.597623586654663, "learning_rate": 5.956121212121213e-05, "loss": 1.606, "step": 410500 }, { "epoch": 2.11414315475425, "grad_norm": 1.9947013854980469, "learning_rate": 5.951070707070707e-05, "loss": 1.5954, "step": 411000 }, { "epoch": 2.11671510506417, "grad_norm": 2.544792652130127, "learning_rate": 5.946020202020203e-05, "loss": 1.5898, "step": 411500 }, { "epoch": 2.11928705537409, "grad_norm": 2.5514931678771973, "learning_rate": 5.940979797979799e-05, "loss": 1.5974, "step": 412000 }, { "epoch": 2.1218590056840103, "grad_norm": 2.448437213897705, "learning_rate": 5.935929292929293e-05, "loss": 1.6039, "step": 412500 }, { "epoch": 2.12443095599393, "grad_norm": 2.7591707706451416, "learning_rate": 5.930878787878789e-05, "loss": 1.604, "step": 413000 }, { "epoch": 2.1270029063038502, "grad_norm": 2.3299643993377686, "learning_rate": 5.925828282828283e-05, "loss": 1.6002, "step": 413500 }, { "epoch": 2.12957485661377, "grad_norm": 2.2050669193267822, "learning_rate": 5.920777777777778e-05, "loss": 1.6087, "step": 414000 }, { "epoch": 2.13214680692369, "grad_norm": 2.514944314956665, "learning_rate": 5.915727272727273e-05, "loss": 1.5965, "step": 414500 }, { "epoch": 2.1347187572336104, "grad_norm": 2.3953447341918945, "learning_rate": 5.910676767676768e-05, "loss": 1.598, "step": 415000 }, { "epoch": 2.13729070754353, "grad_norm": 2.2718632221221924, "learning_rate": 5.905626262626263e-05, "loss": 1.5952, "step": 415500 }, { "epoch": 2.1398626578534503, "grad_norm": 2.559480905532837, "learning_rate": 5.900585858585859e-05, "loss": 1.5933, "step": 416000 }, { "epoch": 2.14243460816337, "grad_norm": 2.7121787071228027, "learning_rate": 5.895535353535354e-05, "loss": 1.6007, "step": 416500 }, { "epoch": 2.1450065584732902, "grad_norm": 3.180011510848999, "learning_rate": 5.8904848484848486e-05, "loss": 1.5951, "step": 417000 }, { "epoch": 2.1475785087832104, "grad_norm": 3.01538348197937, "learning_rate": 5.885434343434344e-05, "loss": 1.6045, "step": 417500 }, { "epoch": 2.15015045909313, "grad_norm": 2.491154670715332, "learning_rate": 5.8803838383838386e-05, "loss": 1.6071, "step": 418000 }, { "epoch": 2.1527224094030504, "grad_norm": 2.4242184162139893, "learning_rate": 5.875333333333334e-05, "loss": 1.5957, "step": 418500 }, { "epoch": 2.15529435971297, "grad_norm": 2.3193559646606445, "learning_rate": 5.87029292929293e-05, "loss": 1.6033, "step": 419000 }, { "epoch": 2.1578663100228903, "grad_norm": 2.1788597106933594, "learning_rate": 5.8652424242424245e-05, "loss": 1.5927, "step": 419500 }, { "epoch": 2.1604382603328105, "grad_norm": 2.646376371383667, "learning_rate": 5.86019191919192e-05, "loss": 1.5895, "step": 420000 }, { "epoch": 2.1630102106427302, "grad_norm": 2.4380106925964355, "learning_rate": 5.855141414141414e-05, "loss": 1.5864, "step": 420500 }, { "epoch": 2.1655821609526504, "grad_norm": 2.479421377182007, "learning_rate": 5.85009090909091e-05, "loss": 1.5964, "step": 421000 }, { "epoch": 2.16815411126257, "grad_norm": 2.3349339962005615, "learning_rate": 5.845040404040404e-05, "loss": 1.597, "step": 421500 }, { "epoch": 2.1707260615724904, "grad_norm": 2.4106128215789795, "learning_rate": 5.8399999999999997e-05, "loss": 1.5902, "step": 422000 }, { "epoch": 2.1732980118824106, "grad_norm": 2.562054395675659, "learning_rate": 5.834959595959596e-05, "loss": 1.5963, "step": 422500 }, { "epoch": 2.1758699621923303, "grad_norm": 2.206015110015869, "learning_rate": 5.8299090909090916e-05, "loss": 1.5993, "step": 423000 }, { "epoch": 2.1784419125022505, "grad_norm": 2.554619550704956, "learning_rate": 5.8248585858585856e-05, "loss": 1.6091, "step": 423500 }, { "epoch": 2.1810138628121702, "grad_norm": 2.2453134059906006, "learning_rate": 5.8198080808080815e-05, "loss": 1.5852, "step": 424000 }, { "epoch": 2.1835858131220904, "grad_norm": 2.3707222938537598, "learning_rate": 5.8147575757575755e-05, "loss": 1.587, "step": 424500 }, { "epoch": 2.1861577634320106, "grad_norm": 2.2257208824157715, "learning_rate": 5.809707070707071e-05, "loss": 1.5821, "step": 425000 }, { "epoch": 2.1887297137419304, "grad_norm": 2.582345724105835, "learning_rate": 5.8046666666666674e-05, "loss": 1.5937, "step": 425500 }, { "epoch": 2.1913016640518506, "grad_norm": 2.2276124954223633, "learning_rate": 5.7996161616161614e-05, "loss": 1.5982, "step": 426000 }, { "epoch": 2.1938736143617703, "grad_norm": 2.5953102111816406, "learning_rate": 5.794565656565657e-05, "loss": 1.6, "step": 426500 }, { "epoch": 2.1964455646716905, "grad_norm": 2.059342861175537, "learning_rate": 5.7895151515151514e-05, "loss": 1.6002, "step": 427000 }, { "epoch": 2.1990175149816107, "grad_norm": 2.5329113006591797, "learning_rate": 5.784464646464647e-05, "loss": 1.5795, "step": 427500 }, { "epoch": 2.2015894652915304, "grad_norm": 2.672567844390869, "learning_rate": 5.779414141414141e-05, "loss": 1.5899, "step": 428000 }, { "epoch": 2.2041614156014506, "grad_norm": 2.0910274982452393, "learning_rate": 5.7743636363636366e-05, "loss": 1.5848, "step": 428500 }, { "epoch": 2.2067333659113704, "grad_norm": 2.369044542312622, "learning_rate": 5.769313131313132e-05, "loss": 1.5936, "step": 429000 }, { "epoch": 2.2093053162212906, "grad_norm": 2.7465758323669434, "learning_rate": 5.7642626262626266e-05, "loss": 1.5933, "step": 429500 }, { "epoch": 2.2118772665312108, "grad_norm": 2.3471922874450684, "learning_rate": 5.7592222222222225e-05, "loss": 1.5846, "step": 430000 }, { "epoch": 2.2144492168411305, "grad_norm": 2.5954208374023438, "learning_rate": 5.754171717171718e-05, "loss": 1.5892, "step": 430500 }, { "epoch": 2.2170211671510507, "grad_norm": 2.122445583343506, "learning_rate": 5.7491212121212125e-05, "loss": 1.5951, "step": 431000 }, { "epoch": 2.2195931174609704, "grad_norm": 2.378053665161133, "learning_rate": 5.744070707070708e-05, "loss": 1.595, "step": 431500 }, { "epoch": 2.2221650677708906, "grad_norm": 3.016186475753784, "learning_rate": 5.7390202020202024e-05, "loss": 1.5805, "step": 432000 }, { "epoch": 2.224737018080811, "grad_norm": 2.2016313076019287, "learning_rate": 5.7339797979797984e-05, "loss": 1.5976, "step": 432500 }, { "epoch": 2.2273089683907306, "grad_norm": 2.296274423599243, "learning_rate": 5.728929292929294e-05, "loss": 1.5844, "step": 433000 }, { "epoch": 2.2298809187006507, "grad_norm": 2.5509867668151855, "learning_rate": 5.723878787878788e-05, "loss": 1.5907, "step": 433500 }, { "epoch": 2.2324528690105705, "grad_norm": 2.5408694744110107, "learning_rate": 5.7188282828282836e-05, "loss": 1.6015, "step": 434000 }, { "epoch": 2.2350248193204907, "grad_norm": 2.5384156703948975, "learning_rate": 5.7137878787878796e-05, "loss": 1.602, "step": 434500 }, { "epoch": 2.237596769630411, "grad_norm": 2.3616080284118652, "learning_rate": 5.708737373737374e-05, "loss": 1.5998, "step": 435000 }, { "epoch": 2.2401687199403306, "grad_norm": 2.7889325618743896, "learning_rate": 5.7036868686868695e-05, "loss": 1.5842, "step": 435500 }, { "epoch": 2.242740670250251, "grad_norm": 2.3167500495910645, "learning_rate": 5.6986363636363635e-05, "loss": 1.5897, "step": 436000 }, { "epoch": 2.2453126205601706, "grad_norm": 2.556781053543091, "learning_rate": 5.6935858585858595e-05, "loss": 1.5807, "step": 436500 }, { "epoch": 2.2478845708700907, "grad_norm": 2.290909767150879, "learning_rate": 5.6885353535353534e-05, "loss": 1.5765, "step": 437000 }, { "epoch": 2.250456521180011, "grad_norm": 2.239105224609375, "learning_rate": 5.6834949494949494e-05, "loss": 1.5916, "step": 437500 }, { "epoch": 2.2530284714899307, "grad_norm": 2.7574894428253174, "learning_rate": 5.6784444444444454e-05, "loss": 1.5915, "step": 438000 }, { "epoch": 2.255600421799851, "grad_norm": 2.2202274799346924, "learning_rate": 5.673393939393939e-05, "loss": 1.5921, "step": 438500 }, { "epoch": 2.2581723721097706, "grad_norm": 2.6853768825531006, "learning_rate": 5.6683434343434346e-05, "loss": 1.5815, "step": 439000 }, { "epoch": 2.260744322419691, "grad_norm": 2.2511544227600098, "learning_rate": 5.663292929292929e-05, "loss": 1.5858, "step": 439500 }, { "epoch": 2.263316272729611, "grad_norm": 2.5201659202575684, "learning_rate": 5.6582424242424246e-05, "loss": 1.577, "step": 440000 }, { "epoch": 2.2658882230395307, "grad_norm": 2.3538320064544678, "learning_rate": 5.6532020202020206e-05, "loss": 1.5781, "step": 440500 }, { "epoch": 2.268460173349451, "grad_norm": 2.352900981903076, "learning_rate": 5.648151515151515e-05, "loss": 1.5677, "step": 441000 }, { "epoch": 2.2710321236593707, "grad_norm": 2.8098092079162598, "learning_rate": 5.6431010101010105e-05, "loss": 1.5754, "step": 441500 }, { "epoch": 2.273604073969291, "grad_norm": 2.5628156661987305, "learning_rate": 5.638050505050505e-05, "loss": 1.5882, "step": 442000 }, { "epoch": 2.276176024279211, "grad_norm": 2.2846975326538086, "learning_rate": 5.633010101010101e-05, "loss": 1.5868, "step": 442500 }, { "epoch": 2.278747974589131, "grad_norm": 2.268409013748169, "learning_rate": 5.6279595959595964e-05, "loss": 1.5823, "step": 443000 }, { "epoch": 2.281319924899051, "grad_norm": 2.092773914337158, "learning_rate": 5.622909090909091e-05, "loss": 1.5884, "step": 443500 }, { "epoch": 2.2838918752089707, "grad_norm": 2.2289109230041504, "learning_rate": 5.6178585858585863e-05, "loss": 1.5933, "step": 444000 }, { "epoch": 2.286463825518891, "grad_norm": 2.1926701068878174, "learning_rate": 5.612808080808081e-05, "loss": 1.5816, "step": 444500 }, { "epoch": 2.289035775828811, "grad_norm": 2.5182721614837646, "learning_rate": 5.607757575757576e-05, "loss": 1.5973, "step": 445000 }, { "epoch": 2.291607726138731, "grad_norm": 2.6437392234802246, "learning_rate": 5.602717171717172e-05, "loss": 1.5841, "step": 445500 }, { "epoch": 2.294179676448651, "grad_norm": 2.7058298587799072, "learning_rate": 5.597666666666667e-05, "loss": 1.5831, "step": 446000 }, { "epoch": 2.296751626758571, "grad_norm": 2.0953357219696045, "learning_rate": 5.592616161616162e-05, "loss": 1.5918, "step": 446500 }, { "epoch": 2.299323577068491, "grad_norm": 2.297541618347168, "learning_rate": 5.587565656565656e-05, "loss": 1.5666, "step": 447000 }, { "epoch": 2.301895527378411, "grad_norm": 2.4610650539398193, "learning_rate": 5.582525252525253e-05, "loss": 1.5804, "step": 447500 }, { "epoch": 2.304467477688331, "grad_norm": 2.629695415496826, "learning_rate": 5.577474747474748e-05, "loss": 1.5843, "step": 448000 }, { "epoch": 2.307039427998251, "grad_norm": 2.474860906600952, "learning_rate": 5.572424242424242e-05, "loss": 1.5928, "step": 448500 }, { "epoch": 2.309611378308171, "grad_norm": 2.8906733989715576, "learning_rate": 5.567373737373738e-05, "loss": 1.5825, "step": 449000 }, { "epoch": 2.312183328618091, "grad_norm": 2.610053062438965, "learning_rate": 5.562323232323232e-05, "loss": 1.5864, "step": 449500 }, { "epoch": 2.3147552789280113, "grad_norm": 2.2027618885040283, "learning_rate": 5.557282828282828e-05, "loss": 1.5657, "step": 450000 }, { "epoch": 2.317327229237931, "grad_norm": 2.362893581390381, "learning_rate": 5.552232323232324e-05, "loss": 1.5803, "step": 450500 }, { "epoch": 2.319899179547851, "grad_norm": 3.065056800842285, "learning_rate": 5.547181818181818e-05, "loss": 1.5745, "step": 451000 }, { "epoch": 2.322471129857771, "grad_norm": 2.644787311553955, "learning_rate": 5.542131313131313e-05, "loss": 1.5805, "step": 451500 }, { "epoch": 2.325043080167691, "grad_norm": 2.324190855026245, "learning_rate": 5.537080808080808e-05, "loss": 1.5782, "step": 452000 }, { "epoch": 2.3276150304776113, "grad_norm": 2.8596031665802, "learning_rate": 5.532030303030303e-05, "loss": 1.5731, "step": 452500 }, { "epoch": 2.330186980787531, "grad_norm": 2.6860458850860596, "learning_rate": 5.526979797979798e-05, "loss": 1.5761, "step": 453000 }, { "epoch": 2.3327589310974512, "grad_norm": 2.1039023399353027, "learning_rate": 5.521929292929293e-05, "loss": 1.5773, "step": 453500 }, { "epoch": 2.335330881407371, "grad_norm": 2.399176836013794, "learning_rate": 5.516888888888889e-05, "loss": 1.5705, "step": 454000 }, { "epoch": 2.337902831717291, "grad_norm": 2.207998514175415, "learning_rate": 5.511838383838384e-05, "loss": 1.5846, "step": 454500 }, { "epoch": 2.3404747820272114, "grad_norm": 2.3117659091949463, "learning_rate": 5.50679797979798e-05, "loss": 1.5773, "step": 455000 }, { "epoch": 2.343046732337131, "grad_norm": 2.4075472354888916, "learning_rate": 5.501747474747475e-05, "loss": 1.5747, "step": 455500 }, { "epoch": 2.3456186826470513, "grad_norm": 2.715557813644409, "learning_rate": 5.4966969696969696e-05, "loss": 1.5734, "step": 456000 }, { "epoch": 2.348190632956971, "grad_norm": 2.486280679702759, "learning_rate": 5.491646464646465e-05, "loss": 1.5764, "step": 456500 }, { "epoch": 2.3507625832668912, "grad_norm": 2.090132713317871, "learning_rate": 5.486606060606061e-05, "loss": 1.5649, "step": 457000 }, { "epoch": 2.3533345335768114, "grad_norm": 3.2762579917907715, "learning_rate": 5.4815555555555555e-05, "loss": 1.5744, "step": 457500 }, { "epoch": 2.355906483886731, "grad_norm": 2.641038179397583, "learning_rate": 5.4765151515151515e-05, "loss": 1.5807, "step": 458000 }, { "epoch": 2.3584784341966514, "grad_norm": 2.685852527618408, "learning_rate": 5.471464646464647e-05, "loss": 1.5747, "step": 458500 }, { "epoch": 2.361050384506571, "grad_norm": 3.033771514892578, "learning_rate": 5.4664141414141414e-05, "loss": 1.5735, "step": 459000 }, { "epoch": 2.3636223348164913, "grad_norm": 2.240175485610962, "learning_rate": 5.461363636363637e-05, "loss": 1.5664, "step": 459500 }, { "epoch": 2.3661942851264115, "grad_norm": 2.1413381099700928, "learning_rate": 5.4563131313131314e-05, "loss": 1.5739, "step": 460000 }, { "epoch": 2.3687662354363312, "grad_norm": 2.455625295639038, "learning_rate": 5.451262626262627e-05, "loss": 1.5737, "step": 460500 }, { "epoch": 2.3713381857462514, "grad_norm": 2.3633012771606445, "learning_rate": 5.446212121212122e-05, "loss": 1.5685, "step": 461000 }, { "epoch": 2.373910136056171, "grad_norm": 2.4887959957122803, "learning_rate": 5.4411616161616166e-05, "loss": 1.5691, "step": 461500 }, { "epoch": 2.3764820863660914, "grad_norm": 2.6525588035583496, "learning_rate": 5.436111111111112e-05, "loss": 1.5663, "step": 462000 }, { "epoch": 2.3790540366760116, "grad_norm": 2.4766228199005127, "learning_rate": 5.431070707070708e-05, "loss": 1.5682, "step": 462500 }, { "epoch": 2.3816259869859313, "grad_norm": 2.230529308319092, "learning_rate": 5.4260202020202025e-05, "loss": 1.5752, "step": 463000 }, { "epoch": 2.3841979372958515, "grad_norm": 2.414194345474243, "learning_rate": 5.420969696969698e-05, "loss": 1.572, "step": 463500 }, { "epoch": 2.3867698876057712, "grad_norm": 2.442136287689209, "learning_rate": 5.415919191919192e-05, "loss": 1.5765, "step": 464000 }, { "epoch": 2.3893418379156914, "grad_norm": 2.0765578746795654, "learning_rate": 5.4108787878787884e-05, "loss": 1.5822, "step": 464500 }, { "epoch": 2.3919137882256116, "grad_norm": 2.4134793281555176, "learning_rate": 5.405828282828284e-05, "loss": 1.5652, "step": 465000 }, { "epoch": 2.3944857385355314, "grad_norm": 2.300403356552124, "learning_rate": 5.400777777777778e-05, "loss": 1.5599, "step": 465500 }, { "epoch": 2.3970576888454516, "grad_norm": 2.1540491580963135, "learning_rate": 5.395727272727274e-05, "loss": 1.5634, "step": 466000 }, { "epoch": 2.3996296391553713, "grad_norm": 2.8791420459747314, "learning_rate": 5.3906767676767676e-05, "loss": 1.5695, "step": 466500 }, { "epoch": 2.4022015894652915, "grad_norm": 2.2609245777130127, "learning_rate": 5.3856363636363636e-05, "loss": 1.5726, "step": 467000 }, { "epoch": 2.4047735397752117, "grad_norm": 2.4185187816619873, "learning_rate": 5.3805858585858596e-05, "loss": 1.5764, "step": 467500 }, { "epoch": 2.4073454900851314, "grad_norm": 2.195435047149658, "learning_rate": 5.3755353535353536e-05, "loss": 1.5616, "step": 468000 }, { "epoch": 2.4099174403950516, "grad_norm": 2.378612756729126, "learning_rate": 5.370484848484849e-05, "loss": 1.5636, "step": 468500 }, { "epoch": 2.4124893907049714, "grad_norm": 2.3817667961120605, "learning_rate": 5.3654343434343435e-05, "loss": 1.5682, "step": 469000 }, { "epoch": 2.4150613410148916, "grad_norm": 2.7806594371795654, "learning_rate": 5.360383838383839e-05, "loss": 1.5611, "step": 469500 }, { "epoch": 2.4176332913248118, "grad_norm": 2.0810320377349854, "learning_rate": 5.3553333333333334e-05, "loss": 1.5717, "step": 470000 }, { "epoch": 2.4202052416347315, "grad_norm": 2.4072470664978027, "learning_rate": 5.350282828282829e-05, "loss": 1.5594, "step": 470500 }, { "epoch": 2.4227771919446517, "grad_norm": 2.347970485687256, "learning_rate": 5.345242424242425e-05, "loss": 1.5619, "step": 471000 }, { "epoch": 2.4253491422545714, "grad_norm": 2.6435277462005615, "learning_rate": 5.3401919191919193e-05, "loss": 1.5848, "step": 471500 }, { "epoch": 2.4279210925644916, "grad_norm": 2.3187005519866943, "learning_rate": 5.335141414141415e-05, "loss": 1.565, "step": 472000 }, { "epoch": 2.430493042874412, "grad_norm": 2.5662784576416016, "learning_rate": 5.3301010101010106e-05, "loss": 1.5764, "step": 472500 }, { "epoch": 2.4330649931843316, "grad_norm": 2.5049164295196533, "learning_rate": 5.325050505050505e-05, "loss": 1.5577, "step": 473000 }, { "epoch": 2.4356369434942517, "grad_norm": 2.5086004734039307, "learning_rate": 5.3200000000000006e-05, "loss": 1.5622, "step": 473500 }, { "epoch": 2.4382088938041715, "grad_norm": 2.5472593307495117, "learning_rate": 5.3149595959595965e-05, "loss": 1.5525, "step": 474000 }, { "epoch": 2.4407808441140917, "grad_norm": 2.441056966781616, "learning_rate": 5.309909090909091e-05, "loss": 1.574, "step": 474500 }, { "epoch": 2.443352794424012, "grad_norm": 2.6029136180877686, "learning_rate": 5.3048585858585865e-05, "loss": 1.5704, "step": 475000 }, { "epoch": 2.4459247447339316, "grad_norm": 2.321699857711792, "learning_rate": 5.299808080808081e-05, "loss": 1.5549, "step": 475500 }, { "epoch": 2.448496695043852, "grad_norm": 2.694145441055298, "learning_rate": 5.2947575757575764e-05, "loss": 1.5603, "step": 476000 }, { "epoch": 2.4510686453537716, "grad_norm": 2.952949047088623, "learning_rate": 5.2897070707070704e-05, "loss": 1.5659, "step": 476500 }, { "epoch": 2.4536405956636917, "grad_norm": 2.3803412914276123, "learning_rate": 5.2846565656565664e-05, "loss": 1.5602, "step": 477000 }, { "epoch": 2.456212545973612, "grad_norm": 2.4755702018737793, "learning_rate": 5.27960606060606e-05, "loss": 1.5719, "step": 477500 }, { "epoch": 2.4587844962835317, "grad_norm": 2.4618046283721924, "learning_rate": 5.2745555555555556e-05, "loss": 1.5675, "step": 478000 }, { "epoch": 2.461356446593452, "grad_norm": 2.186459541320801, "learning_rate": 5.26950505050505e-05, "loss": 1.5638, "step": 478500 }, { "epoch": 2.4639283969033716, "grad_norm": 2.701554298400879, "learning_rate": 5.2644545454545456e-05, "loss": 1.5633, "step": 479000 }, { "epoch": 2.466500347213292, "grad_norm": 2.445854902267456, "learning_rate": 5.25940404040404e-05, "loss": 1.5589, "step": 479500 }, { "epoch": 2.469072297523212, "grad_norm": 2.387634515762329, "learning_rate": 5.254373737373738e-05, "loss": 1.5658, "step": 480000 }, { "epoch": 2.4716442478331317, "grad_norm": 2.4959769248962402, "learning_rate": 5.249323232323232e-05, "loss": 1.5637, "step": 480500 }, { "epoch": 2.474216198143052, "grad_norm": 2.722851276397705, "learning_rate": 5.2442727272727274e-05, "loss": 1.5659, "step": 481000 }, { "epoch": 2.4767881484529717, "grad_norm": 2.4769365787506104, "learning_rate": 5.239222222222222e-05, "loss": 1.5599, "step": 481500 }, { "epoch": 2.479360098762892, "grad_norm": 2.57315993309021, "learning_rate": 5.2341717171717174e-05, "loss": 1.5583, "step": 482000 }, { "epoch": 2.481932049072812, "grad_norm": 2.319643974304199, "learning_rate": 5.229121212121212e-05, "loss": 1.5598, "step": 482500 }, { "epoch": 2.484503999382732, "grad_norm": 2.470033645629883, "learning_rate": 5.224080808080808e-05, "loss": 1.5547, "step": 483000 }, { "epoch": 2.487075949692652, "grad_norm": 3.001162052154541, "learning_rate": 5.219030303030303e-05, "loss": 1.5655, "step": 483500 }, { "epoch": 2.4896479000025717, "grad_norm": 2.486762523651123, "learning_rate": 5.213979797979798e-05, "loss": 1.5738, "step": 484000 }, { "epoch": 2.492219850312492, "grad_norm": 2.6207542419433594, "learning_rate": 5.208939393939394e-05, "loss": 1.5562, "step": 484500 }, { "epoch": 2.494791800622412, "grad_norm": 2.8983652591705322, "learning_rate": 5.203888888888889e-05, "loss": 1.5562, "step": 485000 }, { "epoch": 2.497363750932332, "grad_norm": 2.157689332962036, "learning_rate": 5.198838383838384e-05, "loss": 1.561, "step": 485500 }, { "epoch": 2.499935701242252, "grad_norm": 2.469301462173462, "learning_rate": 5.193787878787879e-05, "loss": 1.5567, "step": 486000 }, { "epoch": 2.502507651552172, "grad_norm": 2.441870927810669, "learning_rate": 5.188747474747475e-05, "loss": 1.5579, "step": 486500 }, { "epoch": 2.505079601862092, "grad_norm": 2.232508897781372, "learning_rate": 5.18369696969697e-05, "loss": 1.5521, "step": 487000 }, { "epoch": 2.507651552172012, "grad_norm": 2.48417067527771, "learning_rate": 5.178646464646465e-05, "loss": 1.5602, "step": 487500 }, { "epoch": 2.510223502481932, "grad_norm": 2.3687491416931152, "learning_rate": 5.173595959595959e-05, "loss": 1.5541, "step": 488000 }, { "epoch": 2.512795452791852, "grad_norm": 2.153627395629883, "learning_rate": 5.168545454545455e-05, "loss": 1.5581, "step": 488500 }, { "epoch": 2.515367403101772, "grad_norm": 2.908628463745117, "learning_rate": 5.16349494949495e-05, "loss": 1.5644, "step": 489000 }, { "epoch": 2.517939353411692, "grad_norm": 2.5632777214050293, "learning_rate": 5.158444444444445e-05, "loss": 1.5542, "step": 489500 }, { "epoch": 2.5205113037216123, "grad_norm": 2.2820920944213867, "learning_rate": 5.15339393939394e-05, "loss": 1.5538, "step": 490000 }, { "epoch": 2.523083254031532, "grad_norm": 2.4731087684631348, "learning_rate": 5.148343434343434e-05, "loss": 1.5454, "step": 490500 }, { "epoch": 2.525655204341452, "grad_norm": 2.622070789337158, "learning_rate": 5.14329292929293e-05, "loss": 1.5595, "step": 491000 }, { "epoch": 2.528227154651372, "grad_norm": 2.20470929145813, "learning_rate": 5.138242424242424e-05, "loss": 1.5518, "step": 491500 }, { "epoch": 2.530799104961292, "grad_norm": 3.232024669647217, "learning_rate": 5.1331919191919195e-05, "loss": 1.5537, "step": 492000 }, { "epoch": 2.5333710552712123, "grad_norm": 2.674577236175537, "learning_rate": 5.128151515151516e-05, "loss": 1.5556, "step": 492500 }, { "epoch": 2.535943005581132, "grad_norm": 2.4473094940185547, "learning_rate": 5.12310101010101e-05, "loss": 1.5584, "step": 493000 }, { "epoch": 2.5385149558910522, "grad_norm": 2.435515880584717, "learning_rate": 5.1180505050505054e-05, "loss": 1.5543, "step": 493500 }, { "epoch": 2.541086906200972, "grad_norm": 2.112659454345703, "learning_rate": 5.113e-05, "loss": 1.5434, "step": 494000 }, { "epoch": 2.543658856510892, "grad_norm": 2.5637118816375732, "learning_rate": 5.107949494949495e-05, "loss": 1.5566, "step": 494500 }, { "epoch": 2.5462308068208124, "grad_norm": 2.8220012187957764, "learning_rate": 5.10289898989899e-05, "loss": 1.5556, "step": 495000 }, { "epoch": 2.548802757130732, "grad_norm": 2.318514108657837, "learning_rate": 5.097858585858586e-05, "loss": 1.5626, "step": 495500 }, { "epoch": 2.5513747074406523, "grad_norm": 2.184453248977661, "learning_rate": 5.092808080808081e-05, "loss": 1.5428, "step": 496000 }, { "epoch": 2.553946657750572, "grad_norm": 2.3431742191314697, "learning_rate": 5.087757575757576e-05, "loss": 1.5507, "step": 496500 }, { "epoch": 2.5565186080604922, "grad_norm": 2.6357996463775635, "learning_rate": 5.082707070707071e-05, "loss": 1.5588, "step": 497000 }, { "epoch": 2.5590905583704124, "grad_norm": 2.3024609088897705, "learning_rate": 5.077656565656566e-05, "loss": 1.5406, "step": 497500 }, { "epoch": 2.561662508680332, "grad_norm": 3.5537869930267334, "learning_rate": 5.072616161616162e-05, "loss": 1.5531, "step": 498000 }, { "epoch": 2.5642344589902524, "grad_norm": 2.6683225631713867, "learning_rate": 5.067565656565657e-05, "loss": 1.5418, "step": 498500 }, { "epoch": 2.566806409300172, "grad_norm": 2.3651461601257324, "learning_rate": 5.062525252525253e-05, "loss": 1.5483, "step": 499000 }, { "epoch": 2.5693783596100923, "grad_norm": 2.525416374206543, "learning_rate": 5.0574747474747477e-05, "loss": 1.5602, "step": 499500 }, { "epoch": 2.5719503099200125, "grad_norm": 2.435364007949829, "learning_rate": 5.052424242424243e-05, "loss": 1.5521, "step": 500000 }, { "epoch": 2.5745222602299322, "grad_norm": 2.486356735229492, "learning_rate": 5.047383838383839e-05, "loss": 1.5585, "step": 500500 }, { "epoch": 2.5770942105398524, "grad_norm": 2.385429859161377, "learning_rate": 5.0423333333333336e-05, "loss": 1.5457, "step": 501000 }, { "epoch": 2.579666160849772, "grad_norm": 2.468360185623169, "learning_rate": 5.037282828282829e-05, "loss": 1.5565, "step": 501500 }, { "epoch": 2.5822381111596924, "grad_norm": 2.0873448848724365, "learning_rate": 5.0322323232323235e-05, "loss": 1.5395, "step": 502000 }, { "epoch": 2.5848100614696126, "grad_norm": 2.7715628147125244, "learning_rate": 5.027181818181819e-05, "loss": 1.5414, "step": 502500 }, { "epoch": 2.5873820117795323, "grad_norm": 2.3114826679229736, "learning_rate": 5.022131313131313e-05, "loss": 1.5587, "step": 503000 }, { "epoch": 2.5899539620894525, "grad_norm": 2.4721546173095703, "learning_rate": 5.017080808080809e-05, "loss": 1.548, "step": 503500 }, { "epoch": 2.5925259123993722, "grad_norm": 2.3029587268829346, "learning_rate": 5.012030303030303e-05, "loss": 1.5513, "step": 504000 }, { "epoch": 2.5950978627092924, "grad_norm": 2.0909407138824463, "learning_rate": 5.006979797979798e-05, "loss": 1.5537, "step": 504500 }, { "epoch": 2.5976698130192126, "grad_norm": 2.5967423915863037, "learning_rate": 5.001929292929293e-05, "loss": 1.5472, "step": 505000 }, { "epoch": 2.6002417633291324, "grad_norm": 2.921551465988159, "learning_rate": 4.996878787878788e-05, "loss": 1.5311, "step": 505500 }, { "epoch": 2.6028137136390526, "grad_norm": 2.4251134395599365, "learning_rate": 4.991828282828283e-05, "loss": 1.5411, "step": 506000 }, { "epoch": 2.6053856639489723, "grad_norm": 2.736292600631714, "learning_rate": 4.986777777777778e-05, "loss": 1.5552, "step": 506500 }, { "epoch": 2.6079576142588925, "grad_norm": 2.5091052055358887, "learning_rate": 4.9817272727272726e-05, "loss": 1.5535, "step": 507000 }, { "epoch": 2.6105295645688127, "grad_norm": 2.42288875579834, "learning_rate": 4.976686868686869e-05, "loss": 1.552, "step": 507500 }, { "epoch": 2.6131015148787324, "grad_norm": 2.5599241256713867, "learning_rate": 4.971646464646465e-05, "loss": 1.5447, "step": 508000 }, { "epoch": 2.6156734651886526, "grad_norm": 2.5007565021514893, "learning_rate": 4.96659595959596e-05, "loss": 1.5493, "step": 508500 }, { "epoch": 2.6182454154985724, "grad_norm": 2.389376401901245, "learning_rate": 4.961545454545455e-05, "loss": 1.5411, "step": 509000 }, { "epoch": 2.6208173658084926, "grad_norm": 2.1207945346832275, "learning_rate": 4.95649494949495e-05, "loss": 1.5445, "step": 509500 }, { "epoch": 2.6233893161184128, "grad_norm": 2.447849750518799, "learning_rate": 4.9514444444444444e-05, "loss": 1.5445, "step": 510000 }, { "epoch": 2.6259612664283325, "grad_norm": 2.1976988315582275, "learning_rate": 4.94639393939394e-05, "loss": 1.5427, "step": 510500 }, { "epoch": 2.6285332167382527, "grad_norm": 3.0653698444366455, "learning_rate": 4.9413535353535356e-05, "loss": 1.5496, "step": 511000 }, { "epoch": 2.6311051670481724, "grad_norm": 2.4654083251953125, "learning_rate": 4.93630303030303e-05, "loss": 1.5482, "step": 511500 }, { "epoch": 2.6336771173580926, "grad_norm": 2.5089690685272217, "learning_rate": 4.9312525252525256e-05, "loss": 1.5525, "step": 512000 }, { "epoch": 2.636249067668013, "grad_norm": 2.4755592346191406, "learning_rate": 4.92620202020202e-05, "loss": 1.5527, "step": 512500 }, { "epoch": 2.6388210179779326, "grad_norm": 2.8626086711883545, "learning_rate": 4.9211515151515155e-05, "loss": 1.5388, "step": 513000 }, { "epoch": 2.6413929682878527, "grad_norm": 2.2445054054260254, "learning_rate": 4.91610101010101e-05, "loss": 1.5513, "step": 513500 }, { "epoch": 2.6439649185977725, "grad_norm": 2.358511447906494, "learning_rate": 4.911050505050505e-05, "loss": 1.5538, "step": 514000 }, { "epoch": 2.6465368689076927, "grad_norm": 2.549711227416992, "learning_rate": 4.906e-05, "loss": 1.5489, "step": 514500 }, { "epoch": 2.649108819217613, "grad_norm": 2.0755455493927, "learning_rate": 4.900949494949495e-05, "loss": 1.5371, "step": 515000 }, { "epoch": 2.6516807695275326, "grad_norm": 2.5039193630218506, "learning_rate": 4.895909090909091e-05, "loss": 1.5404, "step": 515500 }, { "epoch": 2.654252719837453, "grad_norm": 2.759974241256714, "learning_rate": 4.890858585858586e-05, "loss": 1.5441, "step": 516000 }, { "epoch": 2.6568246701473726, "grad_norm": 1.9532139301300049, "learning_rate": 4.8858080808080807e-05, "loss": 1.5497, "step": 516500 }, { "epoch": 2.6593966204572927, "grad_norm": 3.1684305667877197, "learning_rate": 4.880757575757576e-05, "loss": 1.5516, "step": 517000 }, { "epoch": 2.661968570767213, "grad_norm": 2.467054843902588, "learning_rate": 4.8757070707070706e-05, "loss": 1.538, "step": 517500 }, { "epoch": 2.6645405210771327, "grad_norm": 2.3552815914154053, "learning_rate": 4.870656565656566e-05, "loss": 1.5521, "step": 518000 }, { "epoch": 2.667112471387053, "grad_norm": 2.5004801750183105, "learning_rate": 4.865606060606061e-05, "loss": 1.5419, "step": 518500 }, { "epoch": 2.6696844216969726, "grad_norm": 2.8119254112243652, "learning_rate": 4.8605656565656565e-05, "loss": 1.5504, "step": 519000 }, { "epoch": 2.672256372006893, "grad_norm": 2.6918792724609375, "learning_rate": 4.855515151515152e-05, "loss": 1.5384, "step": 519500 }, { "epoch": 2.674828322316813, "grad_norm": 2.4995195865631104, "learning_rate": 4.850464646464647e-05, "loss": 1.5441, "step": 520000 }, { "epoch": 2.6774002726267327, "grad_norm": 2.166651964187622, "learning_rate": 4.845414141414142e-05, "loss": 1.5404, "step": 520500 }, { "epoch": 2.679972222936653, "grad_norm": 2.4418795108795166, "learning_rate": 4.840373737373737e-05, "loss": 1.527, "step": 521000 }, { "epoch": 2.6825441732465727, "grad_norm": 2.3248789310455322, "learning_rate": 4.835323232323233e-05, "loss": 1.5377, "step": 521500 }, { "epoch": 2.685116123556493, "grad_norm": 2.5221030712127686, "learning_rate": 4.830272727272728e-05, "loss": 1.5421, "step": 522000 }, { "epoch": 2.687688073866413, "grad_norm": 2.7731223106384277, "learning_rate": 4.825222222222222e-05, "loss": 1.5382, "step": 522500 }, { "epoch": 2.690260024176333, "grad_norm": 2.157928943634033, "learning_rate": 4.820181818181819e-05, "loss": 1.5402, "step": 523000 }, { "epoch": 2.692831974486253, "grad_norm": 2.3403429985046387, "learning_rate": 4.8151313131313136e-05, "loss": 1.5323, "step": 523500 }, { "epoch": 2.6954039247961727, "grad_norm": 2.8037800788879395, "learning_rate": 4.810080808080808e-05, "loss": 1.5366, "step": 524000 }, { "epoch": 2.697975875106093, "grad_norm": 2.8812320232391357, "learning_rate": 4.8050303030303035e-05, "loss": 1.5489, "step": 524500 }, { "epoch": 2.700547825416013, "grad_norm": 2.4520397186279297, "learning_rate": 4.799979797979798e-05, "loss": 1.5409, "step": 525000 }, { "epoch": 2.703119775725933, "grad_norm": 2.239299774169922, "learning_rate": 4.7949292929292935e-05, "loss": 1.549, "step": 525500 }, { "epoch": 2.705691726035853, "grad_norm": 2.172064781188965, "learning_rate": 4.7898888888888894e-05, "loss": 1.5365, "step": 526000 }, { "epoch": 2.708263676345773, "grad_norm": 2.851077079772949, "learning_rate": 4.784848484848485e-05, "loss": 1.5329, "step": 526500 }, { "epoch": 2.710835626655693, "grad_norm": 2.423591136932373, "learning_rate": 4.77979797979798e-05, "loss": 1.5382, "step": 527000 }, { "epoch": 2.713407576965613, "grad_norm": 2.675351858139038, "learning_rate": 4.774747474747475e-05, "loss": 1.548, "step": 527500 }, { "epoch": 2.715979527275533, "grad_norm": 2.165239095687866, "learning_rate": 4.76969696969697e-05, "loss": 1.5423, "step": 528000 }, { "epoch": 2.718551477585453, "grad_norm": 2.6030383110046387, "learning_rate": 4.764646464646465e-05, "loss": 1.5309, "step": 528500 }, { "epoch": 2.721123427895373, "grad_norm": 2.359309196472168, "learning_rate": 4.75959595959596e-05, "loss": 1.5286, "step": 529000 }, { "epoch": 2.723695378205293, "grad_norm": 2.1645898818969727, "learning_rate": 4.7545454545454545e-05, "loss": 1.5376, "step": 529500 }, { "epoch": 2.7262673285152133, "grad_norm": 2.3792974948883057, "learning_rate": 4.74949494949495e-05, "loss": 1.5367, "step": 530000 }, { "epoch": 2.728839278825133, "grad_norm": 2.7375681400299072, "learning_rate": 4.7444444444444445e-05, "loss": 1.5249, "step": 530500 }, { "epoch": 2.731411229135053, "grad_norm": 2.417910099029541, "learning_rate": 4.7394040404040405e-05, "loss": 1.534, "step": 531000 }, { "epoch": 2.733983179444973, "grad_norm": 2.386302947998047, "learning_rate": 4.734363636363637e-05, "loss": 1.538, "step": 531500 }, { "epoch": 2.736555129754893, "grad_norm": 2.2796523571014404, "learning_rate": 4.729313131313132e-05, "loss": 1.5281, "step": 532000 }, { "epoch": 2.7391270800648133, "grad_norm": 2.4717445373535156, "learning_rate": 4.7242626262626264e-05, "loss": 1.5375, "step": 532500 }, { "epoch": 2.741699030374733, "grad_norm": 2.348935842514038, "learning_rate": 4.719212121212122e-05, "loss": 1.5303, "step": 533000 }, { "epoch": 2.7442709806846532, "grad_norm": 2.4359893798828125, "learning_rate": 4.714161616161616e-05, "loss": 1.5418, "step": 533500 }, { "epoch": 2.746842930994573, "grad_norm": 3.118255853652954, "learning_rate": 4.7091111111111116e-05, "loss": 1.538, "step": 534000 }, { "epoch": 2.749414881304493, "grad_norm": 2.450284004211426, "learning_rate": 4.704060606060606e-05, "loss": 1.5321, "step": 534500 }, { "epoch": 2.7519868316144134, "grad_norm": 2.3103396892547607, "learning_rate": 4.699010101010101e-05, "loss": 1.5388, "step": 535000 }, { "epoch": 2.754558781924333, "grad_norm": 2.439276695251465, "learning_rate": 4.693959595959596e-05, "loss": 1.5228, "step": 535500 }, { "epoch": 2.7571307322342533, "grad_norm": 2.310704231262207, "learning_rate": 4.688909090909091e-05, "loss": 1.52, "step": 536000 }, { "epoch": 2.759702682544173, "grad_norm": 3.0740084648132324, "learning_rate": 4.683868686868687e-05, "loss": 1.5354, "step": 536500 }, { "epoch": 2.7622746328540932, "grad_norm": 2.635913848876953, "learning_rate": 4.678818181818182e-05, "loss": 1.5301, "step": 537000 }, { "epoch": 2.7648465831640134, "grad_norm": 2.3458645343780518, "learning_rate": 4.673767676767677e-05, "loss": 1.5213, "step": 537500 }, { "epoch": 2.767418533473933, "grad_norm": 2.191563367843628, "learning_rate": 4.668717171717172e-05, "loss": 1.5309, "step": 538000 }, { "epoch": 2.7699904837838534, "grad_norm": 2.256751537322998, "learning_rate": 4.663666666666667e-05, "loss": 1.5254, "step": 538500 }, { "epoch": 2.772562434093773, "grad_norm": 2.0021133422851562, "learning_rate": 4.658616161616162e-05, "loss": 1.5261, "step": 539000 }, { "epoch": 2.7751343844036933, "grad_norm": 2.282194137573242, "learning_rate": 4.6535656565656566e-05, "loss": 1.5275, "step": 539500 }, { "epoch": 2.7777063347136135, "grad_norm": 2.4739699363708496, "learning_rate": 4.6485252525252526e-05, "loss": 1.5292, "step": 540000 }, { "epoch": 2.7802782850235332, "grad_norm": 2.498216390609741, "learning_rate": 4.643474747474747e-05, "loss": 1.5248, "step": 540500 }, { "epoch": 2.7828502353334534, "grad_norm": 2.388746738433838, "learning_rate": 4.6384242424242425e-05, "loss": 1.5217, "step": 541000 }, { "epoch": 2.785422185643373, "grad_norm": 2.673908233642578, "learning_rate": 4.633373737373737e-05, "loss": 1.5309, "step": 541500 }, { "epoch": 2.7879941359532934, "grad_norm": 2.3223876953125, "learning_rate": 4.628333333333333e-05, "loss": 1.5313, "step": 542000 }, { "epoch": 2.7905660862632136, "grad_norm": 2.03485369682312, "learning_rate": 4.62329292929293e-05, "loss": 1.5357, "step": 542500 }, { "epoch": 2.7931380365731333, "grad_norm": 2.342752456665039, "learning_rate": 4.6182424242424244e-05, "loss": 1.5327, "step": 543000 }, { "epoch": 2.7957099868830535, "grad_norm": 2.879817008972168, "learning_rate": 4.613191919191919e-05, "loss": 1.5384, "step": 543500 }, { "epoch": 2.7982819371929732, "grad_norm": 2.0930681228637695, "learning_rate": 4.608141414141414e-05, "loss": 1.5246, "step": 544000 }, { "epoch": 2.8008538875028934, "grad_norm": 2.341869592666626, "learning_rate": 4.603090909090909e-05, "loss": 1.5255, "step": 544500 }, { "epoch": 2.8034258378128136, "grad_norm": 2.309088945388794, "learning_rate": 4.598040404040404e-05, "loss": 1.5201, "step": 545000 }, { "epoch": 2.8059977881227334, "grad_norm": 2.4833176136016846, "learning_rate": 4.592989898989899e-05, "loss": 1.5327, "step": 545500 }, { "epoch": 2.8085697384326536, "grad_norm": 2.2396302223205566, "learning_rate": 4.587939393939394e-05, "loss": 1.52, "step": 546000 }, { "epoch": 2.8111416887425733, "grad_norm": 2.740811586380005, "learning_rate": 4.5828888888888895e-05, "loss": 1.5327, "step": 546500 }, { "epoch": 2.8137136390524935, "grad_norm": 3.175210952758789, "learning_rate": 4.577848484848485e-05, "loss": 1.529, "step": 547000 }, { "epoch": 2.8162855893624137, "grad_norm": 2.597053050994873, "learning_rate": 4.5727979797979795e-05, "loss": 1.5238, "step": 547500 }, { "epoch": 2.8188575396723334, "grad_norm": 2.355821132659912, "learning_rate": 4.5677474747474754e-05, "loss": 1.53, "step": 548000 }, { "epoch": 2.8214294899822536, "grad_norm": 2.92700457572937, "learning_rate": 4.56269696969697e-05, "loss": 1.5212, "step": 548500 }, { "epoch": 2.8240014402921734, "grad_norm": 2.831411838531494, "learning_rate": 4.557646464646465e-05, "loss": 1.527, "step": 549000 }, { "epoch": 2.8265733906020936, "grad_norm": 2.444070816040039, "learning_rate": 4.5526060606060614e-05, "loss": 1.535, "step": 549500 }, { "epoch": 2.8291453409120138, "grad_norm": 2.4589648246765137, "learning_rate": 4.547555555555556e-05, "loss": 1.5284, "step": 550000 }, { "epoch": 2.8317172912219335, "grad_norm": 2.601458787918091, "learning_rate": 4.5425050505050506e-05, "loss": 1.5281, "step": 550500 }, { "epoch": 2.8342892415318537, "grad_norm": 2.6681647300720215, "learning_rate": 4.537454545454546e-05, "loss": 1.5211, "step": 551000 }, { "epoch": 2.8368611918417734, "grad_norm": 2.3051042556762695, "learning_rate": 4.5324040404040406e-05, "loss": 1.5246, "step": 551500 }, { "epoch": 2.8394331421516936, "grad_norm": 3.2226884365081787, "learning_rate": 4.527353535353536e-05, "loss": 1.5223, "step": 552000 }, { "epoch": 2.842005092461614, "grad_norm": 2.27409291267395, "learning_rate": 4.5223030303030305e-05, "loss": 1.5167, "step": 552500 }, { "epoch": 2.8445770427715336, "grad_norm": 2.736320734024048, "learning_rate": 4.517272727272727e-05, "loss": 1.5226, "step": 553000 }, { "epoch": 2.8471489930814537, "grad_norm": 2.539435386657715, "learning_rate": 4.5122222222222224e-05, "loss": 1.521, "step": 553500 }, { "epoch": 2.8497209433913735, "grad_norm": 2.52431321144104, "learning_rate": 4.507171717171718e-05, "loss": 1.5321, "step": 554000 }, { "epoch": 2.8522928937012937, "grad_norm": 2.110541343688965, "learning_rate": 4.5021212121212124e-05, "loss": 1.5233, "step": 554500 }, { "epoch": 2.854864844011214, "grad_norm": 2.501573085784912, "learning_rate": 4.497070707070708e-05, "loss": 1.5267, "step": 555000 }, { "epoch": 2.8574367943211336, "grad_norm": 2.4063198566436768, "learning_rate": 4.4920303030303036e-05, "loss": 1.5171, "step": 555500 }, { "epoch": 2.860008744631054, "grad_norm": 3.3333494663238525, "learning_rate": 4.486979797979798e-05, "loss": 1.5044, "step": 556000 }, { "epoch": 2.8625806949409736, "grad_norm": 2.509376049041748, "learning_rate": 4.4819292929292936e-05, "loss": 1.5242, "step": 556500 }, { "epoch": 2.8651526452508937, "grad_norm": 2.505197048187256, "learning_rate": 4.476878787878788e-05, "loss": 1.5293, "step": 557000 }, { "epoch": 2.867724595560814, "grad_norm": 2.4513468742370605, "learning_rate": 4.471828282828283e-05, "loss": 1.5188, "step": 557500 }, { "epoch": 2.8702965458707337, "grad_norm": 2.7993083000183105, "learning_rate": 4.466777777777778e-05, "loss": 1.5223, "step": 558000 }, { "epoch": 2.872868496180654, "grad_norm": 2.5785391330718994, "learning_rate": 4.461727272727273e-05, "loss": 1.5273, "step": 558500 }, { "epoch": 2.8754404464905736, "grad_norm": 2.3784685134887695, "learning_rate": 4.456676767676768e-05, "loss": 1.5144, "step": 559000 }, { "epoch": 2.878012396800494, "grad_norm": 2.3746955394744873, "learning_rate": 4.451636363636364e-05, "loss": 1.505, "step": 559500 }, { "epoch": 2.880584347110414, "grad_norm": 2.898721218109131, "learning_rate": 4.446585858585859e-05, "loss": 1.5281, "step": 560000 }, { "epoch": 2.8831562974203337, "grad_norm": 2.6563735008239746, "learning_rate": 4.441535353535354e-05, "loss": 1.5211, "step": 560500 }, { "epoch": 2.885728247730254, "grad_norm": 2.060058832168579, "learning_rate": 4.4364848484848487e-05, "loss": 1.5165, "step": 561000 }, { "epoch": 2.8883001980401737, "grad_norm": 2.6480188369750977, "learning_rate": 4.431434343434343e-05, "loss": 1.5312, "step": 561500 }, { "epoch": 2.890872148350094, "grad_norm": 2.5112969875335693, "learning_rate": 4.4263838383838386e-05, "loss": 1.5057, "step": 562000 }, { "epoch": 2.893444098660014, "grad_norm": 2.4975204467773438, "learning_rate": 4.421333333333333e-05, "loss": 1.5081, "step": 562500 }, { "epoch": 2.896016048969934, "grad_norm": 2.5974085330963135, "learning_rate": 4.4162828282828286e-05, "loss": 1.5309, "step": 563000 }, { "epoch": 2.898587999279854, "grad_norm": 2.727055788040161, "learning_rate": 4.411232323232323e-05, "loss": 1.5145, "step": 563500 }, { "epoch": 2.9011599495897737, "grad_norm": 2.283076763153076, "learning_rate": 4.406191919191919e-05, "loss": 1.5187, "step": 564000 }, { "epoch": 2.903731899899694, "grad_norm": 2.706749439239502, "learning_rate": 4.4011414141414145e-05, "loss": 1.5075, "step": 564500 }, { "epoch": 2.906303850209614, "grad_norm": 2.3458900451660156, "learning_rate": 4.396090909090909e-05, "loss": 1.5225, "step": 565000 }, { "epoch": 2.908875800519534, "grad_norm": 2.2899625301361084, "learning_rate": 4.3910404040404044e-05, "loss": 1.5058, "step": 565500 }, { "epoch": 2.911447750829454, "grad_norm": 2.800731658935547, "learning_rate": 4.385989898989899e-05, "loss": 1.5203, "step": 566000 }, { "epoch": 2.914019701139374, "grad_norm": 2.9070866107940674, "learning_rate": 4.380949494949495e-05, "loss": 1.5254, "step": 566500 }, { "epoch": 2.916591651449294, "grad_norm": 2.3995327949523926, "learning_rate": 4.3758989898989896e-05, "loss": 1.5264, "step": 567000 }, { "epoch": 2.919163601759214, "grad_norm": 3.0303332805633545, "learning_rate": 4.370848484848485e-05, "loss": 1.5199, "step": 567500 }, { "epoch": 2.921735552069134, "grad_norm": 2.392720937728882, "learning_rate": 4.3657979797979796e-05, "loss": 1.5069, "step": 568000 }, { "epoch": 2.924307502379054, "grad_norm": 2.5260987281799316, "learning_rate": 4.3607575757575755e-05, "loss": 1.5239, "step": 568500 }, { "epoch": 2.926879452688974, "grad_norm": 1.9965590238571167, "learning_rate": 4.355707070707071e-05, "loss": 1.5112, "step": 569000 }, { "epoch": 2.929451402998894, "grad_norm": 2.7305872440338135, "learning_rate": 4.3506565656565655e-05, "loss": 1.5222, "step": 569500 }, { "epoch": 2.9320233533088143, "grad_norm": 2.196129083633423, "learning_rate": 4.345606060606061e-05, "loss": 1.5237, "step": 570000 }, { "epoch": 2.934595303618734, "grad_norm": 2.489001750946045, "learning_rate": 4.3405555555555554e-05, "loss": 1.5122, "step": 570500 }, { "epoch": 2.937167253928654, "grad_norm": 2.8367908000946045, "learning_rate": 4.335505050505051e-05, "loss": 1.5113, "step": 571000 }, { "epoch": 2.939739204238574, "grad_norm": 2.413041114807129, "learning_rate": 4.330454545454546e-05, "loss": 1.516, "step": 571500 }, { "epoch": 2.942311154548494, "grad_norm": 2.2877037525177, "learning_rate": 4.325404040404041e-05, "loss": 1.5172, "step": 572000 }, { "epoch": 2.9448831048584143, "grad_norm": 2.668660879135132, "learning_rate": 4.3203636363636366e-05, "loss": 1.5107, "step": 572500 }, { "epoch": 2.947455055168334, "grad_norm": 3.0024032592773438, "learning_rate": 4.315313131313132e-05, "loss": 1.5144, "step": 573000 }, { "epoch": 2.9500270054782542, "grad_norm": 2.159865617752075, "learning_rate": 4.3102626262626266e-05, "loss": 1.5036, "step": 573500 }, { "epoch": 2.952598955788174, "grad_norm": 2.5722429752349854, "learning_rate": 4.305212121212122e-05, "loss": 1.5255, "step": 574000 }, { "epoch": 2.955170906098094, "grad_norm": 2.755248546600342, "learning_rate": 4.300171717171718e-05, "loss": 1.5051, "step": 574500 }, { "epoch": 2.9577428564080144, "grad_norm": 2.2805163860321045, "learning_rate": 4.2951212121212125e-05, "loss": 1.5221, "step": 575000 }, { "epoch": 2.960314806717934, "grad_norm": 3.676866292953491, "learning_rate": 4.290070707070707e-05, "loss": 1.5145, "step": 575500 }, { "epoch": 2.9628867570278543, "grad_norm": 2.105748414993286, "learning_rate": 4.2850202020202024e-05, "loss": 1.5063, "step": 576000 }, { "epoch": 2.965458707337774, "grad_norm": 2.225126266479492, "learning_rate": 4.279969696969697e-05, "loss": 1.5101, "step": 576500 }, { "epoch": 2.9680306576476942, "grad_norm": 2.7732033729553223, "learning_rate": 4.274929292929293e-05, "loss": 1.5072, "step": 577000 }, { "epoch": 2.9706026079576144, "grad_norm": 2.483477830886841, "learning_rate": 4.2698787878787883e-05, "loss": 1.5167, "step": 577500 }, { "epoch": 2.973174558267534, "grad_norm": 2.6519720554351807, "learning_rate": 4.264828282828283e-05, "loss": 1.5196, "step": 578000 }, { "epoch": 2.9757465085774544, "grad_norm": 2.3944153785705566, "learning_rate": 4.259777777777778e-05, "loss": 1.5184, "step": 578500 }, { "epoch": 2.978318458887374, "grad_norm": 2.850205183029175, "learning_rate": 4.254727272727273e-05, "loss": 1.5124, "step": 579000 }, { "epoch": 2.9808904091972943, "grad_norm": 1.9868264198303223, "learning_rate": 4.249676767676768e-05, "loss": 1.5045, "step": 579500 }, { "epoch": 2.9834623595072145, "grad_norm": 2.709223985671997, "learning_rate": 4.244636363636364e-05, "loss": 1.512, "step": 580000 }, { "epoch": 2.9860343098171342, "grad_norm": 2.369521141052246, "learning_rate": 4.239585858585859e-05, "loss": 1.5121, "step": 580500 }, { "epoch": 2.9886062601270544, "grad_norm": 2.712256669998169, "learning_rate": 4.2345353535353535e-05, "loss": 1.5157, "step": 581000 }, { "epoch": 2.991178210436974, "grad_norm": 2.4199235439300537, "learning_rate": 4.229484848484849e-05, "loss": 1.5116, "step": 581500 }, { "epoch": 2.9937501607468944, "grad_norm": 2.4358603954315186, "learning_rate": 4.2244343434343434e-05, "loss": 1.5268, "step": 582000 }, { "epoch": 2.9963221110568146, "grad_norm": 2.8168931007385254, "learning_rate": 4.2193939393939394e-05, "loss": 1.5058, "step": 582500 }, { "epoch": 2.9988940613667343, "grad_norm": 2.282642364501953, "learning_rate": 4.214343434343435e-05, "loss": 1.4964, "step": 583000 }, { "epoch": 3.0014660116766545, "grad_norm": 2.6705520153045654, "learning_rate": 4.209292929292929e-05, "loss": 1.5017, "step": 583500 }, { "epoch": 3.0040379619865742, "grad_norm": 2.6078131198883057, "learning_rate": 4.2042424242424246e-05, "loss": 1.4998, "step": 584000 }, { "epoch": 3.0066099122964944, "grad_norm": 2.8063297271728516, "learning_rate": 4.1992020202020206e-05, "loss": 1.497, "step": 584500 }, { "epoch": 3.0091818626064146, "grad_norm": 2.291599750518799, "learning_rate": 4.194151515151515e-05, "loss": 1.5016, "step": 585000 }, { "epoch": 3.0117538129163344, "grad_norm": 2.3349857330322266, "learning_rate": 4.1891010101010105e-05, "loss": 1.4991, "step": 585500 }, { "epoch": 3.0143257632262546, "grad_norm": 2.4059336185455322, "learning_rate": 4.184050505050505e-05, "loss": 1.5076, "step": 586000 }, { "epoch": 3.0168977135361743, "grad_norm": 1.973617672920227, "learning_rate": 4.179e-05, "loss": 1.497, "step": 586500 }, { "epoch": 3.0194696638460945, "grad_norm": 2.9620471000671387, "learning_rate": 4.1739595959595964e-05, "loss": 1.5149, "step": 587000 }, { "epoch": 3.0220416141560147, "grad_norm": 2.4589638710021973, "learning_rate": 4.168909090909091e-05, "loss": 1.5073, "step": 587500 }, { "epoch": 3.0246135644659344, "grad_norm": 2.5346004962921143, "learning_rate": 4.163858585858586e-05, "loss": 1.5117, "step": 588000 }, { "epoch": 3.0271855147758546, "grad_norm": 2.4980521202087402, "learning_rate": 4.158808080808081e-05, "loss": 1.5104, "step": 588500 }, { "epoch": 3.0297574650857744, "grad_norm": 2.5343849658966064, "learning_rate": 4.1537575757575756e-05, "loss": 1.4911, "step": 589000 }, { "epoch": 3.0323294153956946, "grad_norm": 2.3915090560913086, "learning_rate": 4.148707070707071e-05, "loss": 1.5047, "step": 589500 }, { "epoch": 3.0349013657056148, "grad_norm": 2.2896182537078857, "learning_rate": 4.1436565656565656e-05, "loss": 1.4912, "step": 590000 }, { "epoch": 3.0374733160155345, "grad_norm": 2.66957426071167, "learning_rate": 4.1386161616161616e-05, "loss": 1.5044, "step": 590500 }, { "epoch": 3.0400452663254547, "grad_norm": 2.3858649730682373, "learning_rate": 4.133565656565657e-05, "loss": 1.5103, "step": 591000 }, { "epoch": 3.0426172166353744, "grad_norm": 2.154978036880493, "learning_rate": 4.128525252525253e-05, "loss": 1.5072, "step": 591500 }, { "epoch": 3.0451891669452946, "grad_norm": 2.9559261798858643, "learning_rate": 4.1234747474747475e-05, "loss": 1.5125, "step": 592000 }, { "epoch": 3.047761117255215, "grad_norm": 2.4529426097869873, "learning_rate": 4.118424242424243e-05, "loss": 1.4832, "step": 592500 }, { "epoch": 3.0503330675651346, "grad_norm": 2.664656162261963, "learning_rate": 4.1133737373737374e-05, "loss": 1.4932, "step": 593000 }, { "epoch": 3.0529050178750547, "grad_norm": 2.5239176750183105, "learning_rate": 4.108323232323232e-05, "loss": 1.4963, "step": 593500 }, { "epoch": 3.0554769681849745, "grad_norm": 2.7687795162200928, "learning_rate": 4.1032727272727274e-05, "loss": 1.5024, "step": 594000 }, { "epoch": 3.0580489184948947, "grad_norm": 2.636725425720215, "learning_rate": 4.098222222222222e-05, "loss": 1.5095, "step": 594500 }, { "epoch": 3.060620868804815, "grad_norm": 2.643148899078369, "learning_rate": 4.093171717171717e-05, "loss": 1.5023, "step": 595000 }, { "epoch": 3.0631928191147346, "grad_norm": 2.728957176208496, "learning_rate": 4.0881212121212126e-05, "loss": 1.5057, "step": 595500 }, { "epoch": 3.065764769424655, "grad_norm": 2.0928802490234375, "learning_rate": 4.083070707070707e-05, "loss": 1.5028, "step": 596000 }, { "epoch": 3.0683367197345746, "grad_norm": 2.6500329971313477, "learning_rate": 4.078030303030303e-05, "loss": 1.5014, "step": 596500 }, { "epoch": 3.0709086700444947, "grad_norm": 3.050570249557495, "learning_rate": 4.0729797979797985e-05, "loss": 1.4955, "step": 597000 }, { "epoch": 3.073480620354415, "grad_norm": 2.7134509086608887, "learning_rate": 4.067929292929293e-05, "loss": 1.4975, "step": 597500 }, { "epoch": 3.0760525706643347, "grad_norm": 2.270643711090088, "learning_rate": 4.0628787878787885e-05, "loss": 1.4995, "step": 598000 }, { "epoch": 3.078624520974255, "grad_norm": 2.371786594390869, "learning_rate": 4.057828282828283e-05, "loss": 1.5082, "step": 598500 }, { "epoch": 3.0811964712841746, "grad_norm": 2.286396026611328, "learning_rate": 4.052787878787879e-05, "loss": 1.5081, "step": 599000 }, { "epoch": 3.083768421594095, "grad_norm": 2.2606699466705322, "learning_rate": 4.0477373737373744e-05, "loss": 1.4959, "step": 599500 }, { "epoch": 3.086340371904015, "grad_norm": 2.225919008255005, "learning_rate": 4.0426969696969696e-05, "loss": 1.5053, "step": 600000 } ], "logging_steps": 500, "max_steps": 1000000, "num_input_tokens_seen": 0, "num_train_epochs": 6, "save_steps": 50000, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 1.0117530008615338e+19, "train_batch_size": 64, "trial_name": null, "trial_params": null }