{ "best_global_step": 7000, "best_metric": 0.7519673705101013, "best_model_checkpoint": "./final_weather_vlm_outputs2/checkpoint-7000", "epoch": 2.911837350180693, "eval_steps": 500, "global_step": 7000, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.00041598419260068117, "grad_norm": 2.07293701171875, "learning_rate": 0.0, "loss": 3.2992, "step": 1 }, { "epoch": 0.0008319683852013623, "grad_norm": 2.1056036949157715, "learning_rate": 1.0000000000000001e-07, "loss": 3.3088, "step": 2 }, { "epoch": 0.0012479525778020435, "grad_norm": 2.2178235054016113, "learning_rate": 2.0000000000000002e-07, "loss": 3.2469, "step": 3 }, { "epoch": 0.0016639367704027247, "grad_norm": 2.372420072555542, "learning_rate": 3.0000000000000004e-07, "loss": 3.3537, "step": 4 }, { "epoch": 0.002079920963003406, "grad_norm": 2.3263468742370605, "learning_rate": 4.0000000000000003e-07, "loss": 3.4114, "step": 5 }, { "epoch": 0.002495905155604087, "grad_norm": 2.169512987136841, "learning_rate": 5.000000000000001e-07, "loss": 3.5769, "step": 6 }, { "epoch": 0.0029118893482047684, "grad_norm": 2.3013832569122314, "learning_rate": 6.000000000000001e-07, "loss": 3.3392, "step": 7 }, { "epoch": 0.0033278735408054493, "grad_norm": 2.0958638191223145, "learning_rate": 7.000000000000001e-07, "loss": 3.2619, "step": 8 }, { "epoch": 0.0037438577334061307, "grad_norm": 1.9328453540802002, "learning_rate": 8.000000000000001e-07, "loss": 3.2131, "step": 9 }, { "epoch": 0.004159841926006812, "grad_norm": 2.3789584636688232, "learning_rate": 9.000000000000001e-07, "loss": 3.406, "step": 10 }, { "epoch": 0.004575826118607493, "grad_norm": 1.9362244606018066, "learning_rate": 1.0000000000000002e-06, "loss": 3.2308, "step": 11 }, { "epoch": 0.004991810311208174, "grad_norm": 2.134359121322632, "learning_rate": 1.1e-06, "loss": 3.3589, "step": 12 }, { "epoch": 0.005407794503808855, "grad_norm": 2.2601304054260254, "learning_rate": 1.2000000000000002e-06, "loss": 3.5222, "step": 13 }, { "epoch": 0.005823778696409537, "grad_norm": 2.554723024368286, "learning_rate": 1.3e-06, "loss": 3.6675, "step": 14 }, { "epoch": 0.006239762889010217, "grad_norm": 2.717844247817993, "learning_rate": 1.4000000000000001e-06, "loss": 3.6646, "step": 15 }, { "epoch": 0.006655747081610899, "grad_norm": 1.9478408098220825, "learning_rate": 1.5e-06, "loss": 3.3614, "step": 16 }, { "epoch": 0.00707173127421158, "grad_norm": 2.225015640258789, "learning_rate": 1.6000000000000001e-06, "loss": 3.2957, "step": 17 }, { "epoch": 0.007487715466812261, "grad_norm": 2.103025197982788, "learning_rate": 1.7000000000000002e-06, "loss": 3.228, "step": 18 }, { "epoch": 0.007903699659412943, "grad_norm": 1023.1921997070312, "learning_rate": 1.8000000000000001e-06, "loss": 3.3082, "step": 19 }, { "epoch": 0.008319683852013624, "grad_norm": 1.9872760772705078, "learning_rate": 1.9000000000000002e-06, "loss": 3.1915, "step": 20 }, { "epoch": 0.008735668044614304, "grad_norm": 1.939089059829712, "learning_rate": 2.0000000000000003e-06, "loss": 3.2984, "step": 21 }, { "epoch": 0.009151652237214985, "grad_norm": 2.6201746463775635, "learning_rate": 2.1000000000000002e-06, "loss": 3.5615, "step": 22 }, { "epoch": 0.009567636429815667, "grad_norm": 2.252861499786377, "learning_rate": 2.2e-06, "loss": 3.2728, "step": 23 }, { "epoch": 0.009983620622416348, "grad_norm": 2.330608367919922, "learning_rate": 2.3000000000000004e-06, "loss": 3.526, "step": 24 }, { "epoch": 0.01039960481501703, "grad_norm": 2.6205179691314697, "learning_rate": 2.4000000000000003e-06, "loss": 3.546, "step": 25 }, { "epoch": 0.01081558900761771, "grad_norm": 2.249379873275757, "learning_rate": 2.5e-06, "loss": 3.3414, "step": 26 }, { "epoch": 0.011231573200218392, "grad_norm": 2.604339122772217, "learning_rate": 2.6e-06, "loss": 3.5559, "step": 27 }, { "epoch": 0.011647557392819073, "grad_norm": 4.050002098083496, "learning_rate": 2.7000000000000004e-06, "loss": 3.3291, "step": 28 }, { "epoch": 0.012063541585419755, "grad_norm": 2.3841638565063477, "learning_rate": 2.8000000000000003e-06, "loss": 3.3813, "step": 29 }, { "epoch": 0.012479525778020435, "grad_norm": 2.857261896133423, "learning_rate": 2.9e-06, "loss": 3.4393, "step": 30 }, { "epoch": 0.012895509970621116, "grad_norm": 561048.8125, "learning_rate": 3e-06, "loss": 3.1854, "step": 31 }, { "epoch": 0.013311494163221797, "grad_norm": 2.4817116260528564, "learning_rate": 3.1000000000000004e-06, "loss": 3.3519, "step": 32 }, { "epoch": 0.013727478355822479, "grad_norm": 2.234112501144409, "learning_rate": 3.2000000000000003e-06, "loss": 3.1944, "step": 33 }, { "epoch": 0.01414346254842316, "grad_norm": 2.3162436485290527, "learning_rate": 3.3000000000000006e-06, "loss": 3.2364, "step": 34 }, { "epoch": 0.014559446741023841, "grad_norm": 2.605085611343384, "learning_rate": 3.4000000000000005e-06, "loss": 3.5203, "step": 35 }, { "epoch": 0.014975430933624523, "grad_norm": 2.3382327556610107, "learning_rate": 3.5e-06, "loss": 3.1759, "step": 36 }, { "epoch": 0.015391415126225204, "grad_norm": 2.6252944469451904, "learning_rate": 3.6000000000000003e-06, "loss": 3.3798, "step": 37 }, { "epoch": 0.015807399318825886, "grad_norm": 2.1962482929229736, "learning_rate": 3.7e-06, "loss": 3.2292, "step": 38 }, { "epoch": 0.016223383511426565, "grad_norm": 3.039541721343994, "learning_rate": 3.8000000000000005e-06, "loss": 3.6186, "step": 39 }, { "epoch": 0.01663936770402725, "grad_norm": 2.4213030338287354, "learning_rate": 3.900000000000001e-06, "loss": 3.1927, "step": 40 }, { "epoch": 0.017055351896627928, "grad_norm": 2.3794989585876465, "learning_rate": 4.000000000000001e-06, "loss": 3.1007, "step": 41 }, { "epoch": 0.017471336089228608, "grad_norm": 2.6517488956451416, "learning_rate": 4.1e-06, "loss": 3.2419, "step": 42 }, { "epoch": 0.01788732028182929, "grad_norm": 2.549868106842041, "learning_rate": 4.2000000000000004e-06, "loss": 3.2482, "step": 43 }, { "epoch": 0.01830330447442997, "grad_norm": 2.6036036014556885, "learning_rate": 4.3e-06, "loss": 3.3473, "step": 44 }, { "epoch": 0.018719288667030654, "grad_norm": 2.4004437923431396, "learning_rate": 4.4e-06, "loss": 3.0934, "step": 45 }, { "epoch": 0.019135272859631333, "grad_norm": 2.6837596893310547, "learning_rate": 4.5e-06, "loss": 3.2273, "step": 46 }, { "epoch": 0.019551257052232016, "grad_norm": 2.7468881607055664, "learning_rate": 4.600000000000001e-06, "loss": 3.0684, "step": 47 }, { "epoch": 0.019967241244832696, "grad_norm": 2.8076424598693848, "learning_rate": 4.7e-06, "loss": 3.368, "step": 48 }, { "epoch": 0.02038322543743338, "grad_norm": 2.3018345832824707, "learning_rate": 4.800000000000001e-06, "loss": 2.8904, "step": 49 }, { "epoch": 0.02079920963003406, "grad_norm": 2.453238010406494, "learning_rate": 4.9000000000000005e-06, "loss": 3.0713, "step": 50 }, { "epoch": 0.02121519382263474, "grad_norm": 2.014765977859497, "learning_rate": 5e-06, "loss": 2.8632, "step": 51 }, { "epoch": 0.02163117801523542, "grad_norm": 2.03943133354187, "learning_rate": 5.1e-06, "loss": 2.8092, "step": 52 }, { "epoch": 0.0220471622078361, "grad_norm": 2.5279381275177, "learning_rate": 5.2e-06, "loss": 2.9927, "step": 53 }, { "epoch": 0.022463146400436784, "grad_norm": 2.0891199111938477, "learning_rate": 5.300000000000001e-06, "loss": 2.7749, "step": 54 }, { "epoch": 0.022879130593037464, "grad_norm": 2.076598882675171, "learning_rate": 5.400000000000001e-06, "loss": 2.8289, "step": 55 }, { "epoch": 0.023295114785638147, "grad_norm": 2.1997528076171875, "learning_rate": 5.500000000000001e-06, "loss": 2.9231, "step": 56 }, { "epoch": 0.023711098978238827, "grad_norm": 1.9822112321853638, "learning_rate": 5.600000000000001e-06, "loss": 2.7399, "step": 57 }, { "epoch": 0.02412708317083951, "grad_norm": 2.1892733573913574, "learning_rate": 5.7e-06, "loss": 2.8552, "step": 58 }, { "epoch": 0.02454306736344019, "grad_norm": 2.5206761360168457, "learning_rate": 5.8e-06, "loss": 2.8422, "step": 59 }, { "epoch": 0.02495905155604087, "grad_norm": 2.17622709274292, "learning_rate": 5.9e-06, "loss": 2.8519, "step": 60 }, { "epoch": 0.025375035748641552, "grad_norm": 2.627504348754883, "learning_rate": 6e-06, "loss": 3.0548, "step": 61 }, { "epoch": 0.025791019941242232, "grad_norm": 2.1194190979003906, "learning_rate": 6.1e-06, "loss": 2.5655, "step": 62 }, { "epoch": 0.026207004133842915, "grad_norm": 1.9074856042861938, "learning_rate": 6.200000000000001e-06, "loss": 2.6804, "step": 63 }, { "epoch": 0.026622988326443595, "grad_norm": 1.9552083015441895, "learning_rate": 6.300000000000001e-06, "loss": 2.7159, "step": 64 }, { "epoch": 0.027038972519044278, "grad_norm": 1.8101811408996582, "learning_rate": 6.4000000000000006e-06, "loss": 2.5642, "step": 65 }, { "epoch": 0.027454956711644957, "grad_norm": 1.9567184448242188, "learning_rate": 6.5000000000000004e-06, "loss": 2.6366, "step": 66 }, { "epoch": 0.027870940904245637, "grad_norm": 1.7584588527679443, "learning_rate": 6.600000000000001e-06, "loss": 2.4862, "step": 67 }, { "epoch": 0.02828692509684632, "grad_norm": 1.6010130643844604, "learning_rate": 6.700000000000001e-06, "loss": 2.3787, "step": 68 }, { "epoch": 0.028702909289447, "grad_norm": 1.7011308670043945, "learning_rate": 6.800000000000001e-06, "loss": 2.319, "step": 69 }, { "epoch": 0.029118893482047683, "grad_norm": 1.4489129781723022, "learning_rate": 6.9e-06, "loss": 2.4028, "step": 70 }, { "epoch": 0.029534877674648363, "grad_norm": 1.3661638498306274, "learning_rate": 7e-06, "loss": 2.268, "step": 71 }, { "epoch": 0.029950861867249046, "grad_norm": 1.2515079975128174, "learning_rate": 7.100000000000001e-06, "loss": 2.3403, "step": 72 }, { "epoch": 0.030366846059849725, "grad_norm": 1.399421215057373, "learning_rate": 7.2000000000000005e-06, "loss": 2.3096, "step": 73 }, { "epoch": 0.03078283025245041, "grad_norm": 1.2910916805267334, "learning_rate": 7.3e-06, "loss": 2.2889, "step": 74 }, { "epoch": 0.031198814445051088, "grad_norm": 1.2009810209274292, "learning_rate": 7.4e-06, "loss": 2.1641, "step": 75 }, { "epoch": 0.03161479863765177, "grad_norm": 1.1969822645187378, "learning_rate": 7.500000000000001e-06, "loss": 2.1389, "step": 76 }, { "epoch": 0.03203078283025245, "grad_norm": 1.096006155014038, "learning_rate": 7.600000000000001e-06, "loss": 2.0575, "step": 77 }, { "epoch": 0.03244676702285313, "grad_norm": 1.1549434661865234, "learning_rate": 7.7e-06, "loss": 2.2123, "step": 78 }, { "epoch": 0.03286275121545381, "grad_norm": 1.2333221435546875, "learning_rate": 7.800000000000002e-06, "loss": 2.1679, "step": 79 }, { "epoch": 0.0332787354080545, "grad_norm": 1.3679039478302002, "learning_rate": 7.9e-06, "loss": 2.2207, "step": 80 }, { "epoch": 0.033694719600655176, "grad_norm": 1.1284209489822388, "learning_rate": 8.000000000000001e-06, "loss": 1.9803, "step": 81 }, { "epoch": 0.034110703793255856, "grad_norm": 1.336327075958252, "learning_rate": 8.1e-06, "loss": 2.0917, "step": 82 }, { "epoch": 0.034526687985856536, "grad_norm": 1.1321452856063843, "learning_rate": 8.2e-06, "loss": 2.0821, "step": 83 }, { "epoch": 0.034942672178457215, "grad_norm": 9.24336051940918, "learning_rate": 8.3e-06, "loss": 1.9149, "step": 84 }, { "epoch": 0.0353586563710579, "grad_norm": 1.1124049425125122, "learning_rate": 8.400000000000001e-06, "loss": 2.0916, "step": 85 }, { "epoch": 0.03577464056365858, "grad_norm": 1.1203227043151855, "learning_rate": 8.5e-06, "loss": 2.0271, "step": 86 }, { "epoch": 0.03619062475625926, "grad_norm": 1.303850769996643, "learning_rate": 8.6e-06, "loss": 1.933, "step": 87 }, { "epoch": 0.03660660894885994, "grad_norm": 0.9983132481575012, "learning_rate": 8.700000000000001e-06, "loss": 1.8723, "step": 88 }, { "epoch": 0.03702259314146063, "grad_norm": 1.1288799047470093, "learning_rate": 8.8e-06, "loss": 1.9042, "step": 89 }, { "epoch": 0.03743857733406131, "grad_norm": 1.1111818552017212, "learning_rate": 8.900000000000001e-06, "loss": 1.996, "step": 90 }, { "epoch": 0.03785456152666199, "grad_norm": 1.0871027708053589, "learning_rate": 9e-06, "loss": 1.8303, "step": 91 }, { "epoch": 0.038270545719262666, "grad_norm": 1.101183533668518, "learning_rate": 9.100000000000001e-06, "loss": 1.9071, "step": 92 }, { "epoch": 0.038686529911863346, "grad_norm": 1.1393071413040161, "learning_rate": 9.200000000000002e-06, "loss": 2.0196, "step": 93 }, { "epoch": 0.03910251410446403, "grad_norm": 1.2939165830612183, "learning_rate": 9.3e-06, "loss": 1.9767, "step": 94 }, { "epoch": 0.03951849829706471, "grad_norm": 1.1663997173309326, "learning_rate": 9.4e-06, "loss": 1.7088, "step": 95 }, { "epoch": 0.03993448248966539, "grad_norm": 1.1486742496490479, "learning_rate": 9.5e-06, "loss": 1.7972, "step": 96 }, { "epoch": 0.04035046668226607, "grad_norm": 1.2377172708511353, "learning_rate": 9.600000000000001e-06, "loss": 1.9935, "step": 97 }, { "epoch": 0.04076645087486676, "grad_norm": 1.13186514377594, "learning_rate": 9.7e-06, "loss": 1.8796, "step": 98 }, { "epoch": 0.04118243506746744, "grad_norm": 1.1601018905639648, "learning_rate": 9.800000000000001e-06, "loss": 1.7082, "step": 99 }, { "epoch": 0.04159841926006812, "grad_norm": 1.198343276977539, "learning_rate": 9.9e-06, "loss": 1.8163, "step": 100 }, { "epoch": 0.0420144034526688, "grad_norm": 1.2000610828399658, "learning_rate": 1e-05, "loss": 1.7183, "step": 101 }, { "epoch": 0.04243038764526948, "grad_norm": 1.2052137851715088, "learning_rate": 1.0100000000000002e-05, "loss": 1.8076, "step": 102 }, { "epoch": 0.04284637183787016, "grad_norm": 1.0895349979400635, "learning_rate": 1.02e-05, "loss": 1.8572, "step": 103 }, { "epoch": 0.04326235603047084, "grad_norm": 1.2848676443099976, "learning_rate": 1.0300000000000001e-05, "loss": 1.7185, "step": 104 }, { "epoch": 0.04367834022307152, "grad_norm": 1.316221833229065, "learning_rate": 1.04e-05, "loss": 1.6276, "step": 105 }, { "epoch": 0.0440943244156722, "grad_norm": 1.1208025217056274, "learning_rate": 1.0500000000000001e-05, "loss": 1.6472, "step": 106 }, { "epoch": 0.04451030860827289, "grad_norm": 1.1380650997161865, "learning_rate": 1.0600000000000002e-05, "loss": 1.6004, "step": 107 }, { "epoch": 0.04492629280087357, "grad_norm": 1.2387417554855347, "learning_rate": 1.0700000000000001e-05, "loss": 1.667, "step": 108 }, { "epoch": 0.04534227699347425, "grad_norm": 0.9655488729476929, "learning_rate": 1.0800000000000002e-05, "loss": 1.6883, "step": 109 }, { "epoch": 0.04575826118607493, "grad_norm": 1.1194164752960205, "learning_rate": 1.0900000000000002e-05, "loss": 1.644, "step": 110 }, { "epoch": 0.04617424537867561, "grad_norm": 0.9314253926277161, "learning_rate": 1.1000000000000001e-05, "loss": 1.5916, "step": 111 }, { "epoch": 0.046590229571276294, "grad_norm": 0.8323829770088196, "learning_rate": 1.1100000000000002e-05, "loss": 1.6518, "step": 112 }, { "epoch": 0.047006213763876974, "grad_norm": 0.697010338306427, "learning_rate": 1.1200000000000001e-05, "loss": 1.844, "step": 113 }, { "epoch": 0.04742219795647765, "grad_norm": 0.5888327956199646, "learning_rate": 1.13e-05, "loss": 1.6903, "step": 114 }, { "epoch": 0.04783818214907833, "grad_norm": 0.5670652389526367, "learning_rate": 1.14e-05, "loss": 1.7888, "step": 115 }, { "epoch": 0.04825416634167902, "grad_norm": 0.518058717250824, "learning_rate": 1.15e-05, "loss": 1.7171, "step": 116 }, { "epoch": 0.0486701505342797, "grad_norm": 0.515910267829895, "learning_rate": 1.16e-05, "loss": 1.6914, "step": 117 }, { "epoch": 0.04908613472688038, "grad_norm": 0.5345506072044373, "learning_rate": 1.17e-05, "loss": 1.6514, "step": 118 }, { "epoch": 0.04950211891948106, "grad_norm": 0.532151460647583, "learning_rate": 1.18e-05, "loss": 1.6945, "step": 119 }, { "epoch": 0.04991810311208174, "grad_norm": 0.5066655278205872, "learning_rate": 1.1900000000000001e-05, "loss": 1.6463, "step": 120 }, { "epoch": 0.050334087304682425, "grad_norm": 0.5152735710144043, "learning_rate": 1.2e-05, "loss": 1.6275, "step": 121 }, { "epoch": 0.050750071497283104, "grad_norm": 0.5415101647377014, "learning_rate": 1.2100000000000001e-05, "loss": 1.5861, "step": 122 }, { "epoch": 0.051166055689883784, "grad_norm": 2.0836234092712402, "learning_rate": 1.22e-05, "loss": 1.6889, "step": 123 }, { "epoch": 0.051582039882484464, "grad_norm": 0.5461239814758301, "learning_rate": 1.23e-05, "loss": 1.7034, "step": 124 }, { "epoch": 0.05199802407508515, "grad_norm": 0.6166593432426453, "learning_rate": 1.2400000000000002e-05, "loss": 1.5588, "step": 125 }, { "epoch": 0.05241400826768583, "grad_norm": 0.6564395427703857, "learning_rate": 1.25e-05, "loss": 1.6119, "step": 126 }, { "epoch": 0.05282999246028651, "grad_norm": 0.6909776329994202, "learning_rate": 1.2600000000000001e-05, "loss": 1.5366, "step": 127 }, { "epoch": 0.05324597665288719, "grad_norm": 0.636279821395874, "learning_rate": 1.27e-05, "loss": 1.6393, "step": 128 }, { "epoch": 0.05366196084548787, "grad_norm": 0.6674097180366516, "learning_rate": 1.2800000000000001e-05, "loss": 1.6138, "step": 129 }, { "epoch": 0.054077945038088555, "grad_norm": 0.5479506850242615, "learning_rate": 1.2900000000000002e-05, "loss": 1.6162, "step": 130 }, { "epoch": 0.054493929230689235, "grad_norm": 0.514046311378479, "learning_rate": 1.3000000000000001e-05, "loss": 1.5609, "step": 131 }, { "epoch": 0.054909913423289915, "grad_norm": 0.4569280743598938, "learning_rate": 1.3100000000000002e-05, "loss": 1.5071, "step": 132 }, { "epoch": 0.055325897615890594, "grad_norm": 0.5004767775535583, "learning_rate": 1.3200000000000002e-05, "loss": 1.6035, "step": 133 }, { "epoch": 0.055741881808491274, "grad_norm": 0.47135022282600403, "learning_rate": 1.3300000000000001e-05, "loss": 1.6182, "step": 134 }, { "epoch": 0.05615786600109196, "grad_norm": 0.5179898738861084, "learning_rate": 1.3400000000000002e-05, "loss": 1.6013, "step": 135 }, { "epoch": 0.05657385019369264, "grad_norm": 0.4217924475669861, "learning_rate": 1.3500000000000001e-05, "loss": 1.6243, "step": 136 }, { "epoch": 0.05698983438629332, "grad_norm": 0.3869641423225403, "learning_rate": 1.3600000000000002e-05, "loss": 1.5429, "step": 137 }, { "epoch": 0.057405818578894, "grad_norm": 0.46726956963539124, "learning_rate": 1.3700000000000003e-05, "loss": 1.4136, "step": 138 }, { "epoch": 0.057821802771494686, "grad_norm": 0.4542416036128998, "learning_rate": 1.38e-05, "loss": 1.5461, "step": 139 }, { "epoch": 0.058237786964095366, "grad_norm": 0.3854866027832031, "learning_rate": 1.39e-05, "loss": 1.4857, "step": 140 }, { "epoch": 0.058653771156696045, "grad_norm": 0.4320104718208313, "learning_rate": 1.4e-05, "loss": 1.4923, "step": 141 }, { "epoch": 0.059069755349296725, "grad_norm": 0.3918810188770294, "learning_rate": 1.41e-05, "loss": 1.4895, "step": 142 }, { "epoch": 0.059485739541897405, "grad_norm": 0.38573160767555237, "learning_rate": 1.4200000000000001e-05, "loss": 1.3981, "step": 143 }, { "epoch": 0.05990172373449809, "grad_norm": 0.4327925443649292, "learning_rate": 1.43e-05, "loss": 1.586, "step": 144 }, { "epoch": 0.06031770792709877, "grad_norm": 0.414200097322464, "learning_rate": 1.4400000000000001e-05, "loss": 1.4367, "step": 145 }, { "epoch": 0.06073369211969945, "grad_norm": 0.4471972584724426, "learning_rate": 1.45e-05, "loss": 1.5331, "step": 146 }, { "epoch": 0.06114967631230013, "grad_norm": 0.7546743154525757, "learning_rate": 1.46e-05, "loss": 1.4173, "step": 147 }, { "epoch": 0.06156566050490082, "grad_norm": 0.39595815539360046, "learning_rate": 1.4700000000000002e-05, "loss": 1.5285, "step": 148 }, { "epoch": 0.061981644697501496, "grad_norm": 0.45172443985939026, "learning_rate": 1.48e-05, "loss": 1.5865, "step": 149 }, { "epoch": 0.062397628890102176, "grad_norm": 0.42620837688446045, "learning_rate": 1.4900000000000001e-05, "loss": 1.4383, "step": 150 }, { "epoch": 0.06281361308270286, "grad_norm": 0.4201160669326782, "learning_rate": 1.5000000000000002e-05, "loss": 1.5924, "step": 151 }, { "epoch": 0.06322959727530354, "grad_norm": 0.4293505549430847, "learning_rate": 1.5100000000000001e-05, "loss": 1.5109, "step": 152 }, { "epoch": 0.06364558146790422, "grad_norm": 0.4352709949016571, "learning_rate": 1.5200000000000002e-05, "loss": 1.4661, "step": 153 }, { "epoch": 0.0640615656605049, "grad_norm": 0.43194663524627686, "learning_rate": 1.5300000000000003e-05, "loss": 1.4689, "step": 154 }, { "epoch": 0.06447754985310558, "grad_norm": 0.42268604040145874, "learning_rate": 1.54e-05, "loss": 1.3916, "step": 155 }, { "epoch": 0.06489353404570626, "grad_norm": 0.4828658401966095, "learning_rate": 1.55e-05, "loss": 1.4757, "step": 156 }, { "epoch": 0.06530951823830694, "grad_norm": 0.5048929452896118, "learning_rate": 1.5600000000000003e-05, "loss": 1.5092, "step": 157 }, { "epoch": 0.06572550243090762, "grad_norm": 0.4853310286998749, "learning_rate": 1.5700000000000002e-05, "loss": 1.4926, "step": 158 }, { "epoch": 0.0661414866235083, "grad_norm": 22.058452606201172, "learning_rate": 1.58e-05, "loss": 1.3469, "step": 159 }, { "epoch": 0.066557470816109, "grad_norm": 0.45353591442108154, "learning_rate": 1.5900000000000004e-05, "loss": 1.4375, "step": 160 }, { "epoch": 0.06697345500870967, "grad_norm": 0.43978020548820496, "learning_rate": 1.6000000000000003e-05, "loss": 1.3753, "step": 161 }, { "epoch": 0.06738943920131035, "grad_norm": 0.46642205119132996, "learning_rate": 1.6100000000000002e-05, "loss": 1.4736, "step": 162 }, { "epoch": 0.06780542339391103, "grad_norm": 0.4381115734577179, "learning_rate": 1.62e-05, "loss": 1.4449, "step": 163 }, { "epoch": 0.06822140758651171, "grad_norm": 0.4396232068538666, "learning_rate": 1.63e-05, "loss": 1.4483, "step": 164 }, { "epoch": 0.06863739177911239, "grad_norm": 0.46247151494026184, "learning_rate": 1.64e-05, "loss": 1.3978, "step": 165 }, { "epoch": 0.06905337597171307, "grad_norm": 0.45181524753570557, "learning_rate": 1.65e-05, "loss": 1.6228, "step": 166 }, { "epoch": 0.06946936016431375, "grad_norm": 0.4493110477924347, "learning_rate": 1.66e-05, "loss": 1.474, "step": 167 }, { "epoch": 0.06988534435691443, "grad_norm": 9.469669342041016, "learning_rate": 1.67e-05, "loss": 1.3988, "step": 168 }, { "epoch": 0.07030132854951512, "grad_norm": 0.4385135769844055, "learning_rate": 1.6800000000000002e-05, "loss": 1.4645, "step": 169 }, { "epoch": 0.0707173127421158, "grad_norm": 0.5094560980796814, "learning_rate": 1.69e-05, "loss": 1.2777, "step": 170 }, { "epoch": 0.07113329693471648, "grad_norm": 0.45567911863327026, "learning_rate": 1.7e-05, "loss": 1.3443, "step": 171 }, { "epoch": 0.07154928112731716, "grad_norm": 0.5907155871391296, "learning_rate": 1.7100000000000002e-05, "loss": 1.4612, "step": 172 }, { "epoch": 0.07196526531991784, "grad_norm": 0.4621376395225525, "learning_rate": 1.72e-05, "loss": 1.4513, "step": 173 }, { "epoch": 0.07238124951251852, "grad_norm": 0.47091981768608093, "learning_rate": 1.73e-05, "loss": 1.4131, "step": 174 }, { "epoch": 0.0727972337051192, "grad_norm": 0.531473696231842, "learning_rate": 1.7400000000000003e-05, "loss": 1.2213, "step": 175 }, { "epoch": 0.07321321789771988, "grad_norm": 450.80120849609375, "learning_rate": 1.7500000000000002e-05, "loss": 1.3906, "step": 176 }, { "epoch": 0.07362920209032056, "grad_norm": 88.7137451171875, "learning_rate": 1.76e-05, "loss": 1.2658, "step": 177 }, { "epoch": 0.07404518628292125, "grad_norm": 0.49480360746383667, "learning_rate": 1.77e-05, "loss": 1.4698, "step": 178 }, { "epoch": 0.07446117047552193, "grad_norm": 0.4701385498046875, "learning_rate": 1.7800000000000002e-05, "loss": 1.2942, "step": 179 }, { "epoch": 0.07487715466812261, "grad_norm": 0.45994943380355835, "learning_rate": 1.79e-05, "loss": 1.3259, "step": 180 }, { "epoch": 0.0752931388607233, "grad_norm": 0.5196590423583984, "learning_rate": 1.8e-05, "loss": 1.2743, "step": 181 }, { "epoch": 0.07570912305332397, "grad_norm": 0.48663389682769775, "learning_rate": 1.8100000000000003e-05, "loss": 1.2996, "step": 182 }, { "epoch": 0.07612510724592465, "grad_norm": 0.5580990314483643, "learning_rate": 1.8200000000000002e-05, "loss": 1.4194, "step": 183 }, { "epoch": 0.07654109143852533, "grad_norm": 0.5013321042060852, "learning_rate": 1.83e-05, "loss": 1.327, "step": 184 }, { "epoch": 0.07695707563112601, "grad_norm": 0.5392055511474609, "learning_rate": 1.8400000000000003e-05, "loss": 1.4335, "step": 185 }, { "epoch": 0.07737305982372669, "grad_norm": 0.6236993074417114, "learning_rate": 1.8500000000000002e-05, "loss": 1.3743, "step": 186 }, { "epoch": 0.07778904401632739, "grad_norm": 0.48424574732780457, "learning_rate": 1.86e-05, "loss": 1.3389, "step": 187 }, { "epoch": 0.07820502820892807, "grad_norm": 0.5568936467170715, "learning_rate": 1.8700000000000004e-05, "loss": 1.1802, "step": 188 }, { "epoch": 0.07862101240152874, "grad_norm": 0.5028245449066162, "learning_rate": 1.88e-05, "loss": 1.3, "step": 189 }, { "epoch": 0.07903699659412942, "grad_norm": 0.5427438616752625, "learning_rate": 1.8900000000000002e-05, "loss": 1.252, "step": 190 }, { "epoch": 0.0794529807867301, "grad_norm": 0.515694260597229, "learning_rate": 1.9e-05, "loss": 1.2695, "step": 191 }, { "epoch": 0.07986896497933078, "grad_norm": 0.5203349590301514, "learning_rate": 1.91e-05, "loss": 1.1892, "step": 192 }, { "epoch": 0.08028494917193146, "grad_norm": 0.5478265285491943, "learning_rate": 1.9200000000000003e-05, "loss": 1.3914, "step": 193 }, { "epoch": 0.08070093336453214, "grad_norm": 0.5809688568115234, "learning_rate": 1.93e-05, "loss": 1.3811, "step": 194 }, { "epoch": 0.08111691755713282, "grad_norm": 0.5369510650634766, "learning_rate": 1.94e-05, "loss": 1.1328, "step": 195 }, { "epoch": 0.08153290174973352, "grad_norm": 0.5076486468315125, "learning_rate": 1.95e-05, "loss": 1.2058, "step": 196 }, { "epoch": 0.0819488859423342, "grad_norm": 0.5416799187660217, "learning_rate": 1.9600000000000002e-05, "loss": 1.3379, "step": 197 }, { "epoch": 0.08236487013493488, "grad_norm": 0.5438718199729919, "learning_rate": 1.97e-05, "loss": 1.4459, "step": 198 }, { "epoch": 0.08278085432753556, "grad_norm": 485.4228210449219, "learning_rate": 1.98e-05, "loss": 1.2395, "step": 199 }, { "epoch": 0.08319683852013623, "grad_norm": 137.8297576904297, "learning_rate": 1.9900000000000003e-05, "loss": 1.2434, "step": 200 }, { "epoch": 0.08361282271273691, "grad_norm": 0.6259170174598694, "learning_rate": 2e-05, "loss": 1.2535, "step": 201 }, { "epoch": 0.0840288069053376, "grad_norm": 0.5395736694335938, "learning_rate": 1.9999998996341592e-05, "loss": 1.2802, "step": 202 }, { "epoch": 0.08444479109793827, "grad_norm": 0.5218931436538696, "learning_rate": 1.9999995985366568e-05, "loss": 1.2499, "step": 203 }, { "epoch": 0.08486077529053895, "grad_norm": 0.5286381244659424, "learning_rate": 1.999999096707553e-05, "loss": 1.2014, "step": 204 }, { "epoch": 0.08527675948313965, "grad_norm": 1.6990370750427246, "learning_rate": 1.9999983941469486e-05, "loss": 1.312, "step": 205 }, { "epoch": 0.08569274367574033, "grad_norm": 0.5225419998168945, "learning_rate": 1.999997490854985e-05, "loss": 1.2986, "step": 206 }, { "epoch": 0.086108727868341, "grad_norm": 0.5020140409469604, "learning_rate": 1.999996386831843e-05, "loss": 1.2712, "step": 207 }, { "epoch": 0.08652471206094169, "grad_norm": 0.541471540927887, "learning_rate": 1.9999950820777448e-05, "loss": 1.2804, "step": 208 }, { "epoch": 0.08694069625354237, "grad_norm": 0.5042675733566284, "learning_rate": 1.999993576592952e-05, "loss": 1.042, "step": 209 }, { "epoch": 0.08735668044614305, "grad_norm": 0.5764191150665283, "learning_rate": 1.9999918703777664e-05, "loss": 1.273, "step": 210 }, { "epoch": 0.08777266463874372, "grad_norm": 0.5510497689247131, "learning_rate": 1.9999899634325314e-05, "loss": 1.2918, "step": 211 }, { "epoch": 0.0881886488313444, "grad_norm": 0.5606387853622437, "learning_rate": 1.9999878557576288e-05, "loss": 1.3317, "step": 212 }, { "epoch": 0.08860463302394508, "grad_norm": 0.54827481508255, "learning_rate": 1.9999855473534825e-05, "loss": 1.2344, "step": 213 }, { "epoch": 0.08902061721654578, "grad_norm": 0.5744351744651794, "learning_rate": 1.9999830382205552e-05, "loss": 1.2196, "step": 214 }, { "epoch": 0.08943660140914646, "grad_norm": 0.5971208214759827, "learning_rate": 1.9999803283593508e-05, "loss": 1.2822, "step": 215 }, { "epoch": 0.08985258560174714, "grad_norm": 0.5652117729187012, "learning_rate": 1.9999774177704134e-05, "loss": 1.2983, "step": 216 }, { "epoch": 0.09026856979434782, "grad_norm": 0.6287267208099365, "learning_rate": 1.9999743064543272e-05, "loss": 1.3288, "step": 217 }, { "epoch": 0.0906845539869485, "grad_norm": 0.5372384190559387, "learning_rate": 1.999970994411717e-05, "loss": 1.2689, "step": 218 }, { "epoch": 0.09110053817954918, "grad_norm": 0.5343045592308044, "learning_rate": 1.999967481643247e-05, "loss": 1.2059, "step": 219 }, { "epoch": 0.09151652237214986, "grad_norm": 0.5887562036514282, "learning_rate": 1.999963768149623e-05, "loss": 1.2917, "step": 220 }, { "epoch": 0.09193250656475054, "grad_norm": 0.5310919284820557, "learning_rate": 1.9999598539315895e-05, "loss": 1.3278, "step": 221 }, { "epoch": 0.09234849075735121, "grad_norm": 0.5903929471969604, "learning_rate": 1.9999557389899335e-05, "loss": 1.1827, "step": 222 }, { "epoch": 0.09276447494995191, "grad_norm": 0.5858891010284424, "learning_rate": 1.99995142332548e-05, "loss": 1.2607, "step": 223 }, { "epoch": 0.09318045914255259, "grad_norm": 0.5915316939353943, "learning_rate": 1.9999469069390954e-05, "loss": 1.2502, "step": 224 }, { "epoch": 0.09359644333515327, "grad_norm": 0.5510343313217163, "learning_rate": 1.9999421898316863e-05, "loss": 1.2111, "step": 225 }, { "epoch": 0.09401242752775395, "grad_norm": 0.5296527147293091, "learning_rate": 1.9999372720042004e-05, "loss": 1.1984, "step": 226 }, { "epoch": 0.09442841172035463, "grad_norm": 0.5727095007896423, "learning_rate": 1.9999321534576236e-05, "loss": 1.2901, "step": 227 }, { "epoch": 0.0948443959129553, "grad_norm": 0.5319727063179016, "learning_rate": 1.9999268341929843e-05, "loss": 1.2938, "step": 228 }, { "epoch": 0.09526038010555599, "grad_norm": 0.5615089535713196, "learning_rate": 1.9999213142113496e-05, "loss": 1.2865, "step": 229 }, { "epoch": 0.09567636429815667, "grad_norm": 0.5923480987548828, "learning_rate": 1.9999155935138282e-05, "loss": 1.3313, "step": 230 }, { "epoch": 0.09609234849075735, "grad_norm": 0.6562802195549011, "learning_rate": 1.9999096721015677e-05, "loss": 1.374, "step": 231 }, { "epoch": 0.09650833268335804, "grad_norm": 0.6394391655921936, "learning_rate": 1.9999035499757572e-05, "loss": 1.2893, "step": 232 }, { "epoch": 0.09692431687595872, "grad_norm": 0.5934363603591919, "learning_rate": 1.9998972271376257e-05, "loss": 1.1951, "step": 233 }, { "epoch": 0.0973403010685594, "grad_norm": 10.114876747131348, "learning_rate": 1.999890703588442e-05, "loss": 1.2511, "step": 234 }, { "epoch": 0.09775628526116008, "grad_norm": 0.6414576768875122, "learning_rate": 1.9998839793295157e-05, "loss": 1.2124, "step": 235 }, { "epoch": 0.09817226945376076, "grad_norm": 0.6024904847145081, "learning_rate": 1.9998770543621968e-05, "loss": 1.3152, "step": 236 }, { "epoch": 0.09858825364636144, "grad_norm": 0.6469990015029907, "learning_rate": 1.999869928687875e-05, "loss": 1.2608, "step": 237 }, { "epoch": 0.09900423783896212, "grad_norm": 0.6154808402061462, "learning_rate": 1.9998626023079808e-05, "loss": 1.3196, "step": 238 }, { "epoch": 0.0994202220315628, "grad_norm": 0.6197603940963745, "learning_rate": 1.999855075223985e-05, "loss": 1.2626, "step": 239 }, { "epoch": 0.09983620622416348, "grad_norm": 0.5699375867843628, "learning_rate": 1.9998473474373983e-05, "loss": 1.0801, "step": 240 }, { "epoch": 0.10025219041676417, "grad_norm": 0.7205070853233337, "learning_rate": 1.999839418949772e-05, "loss": 1.3872, "step": 241 }, { "epoch": 0.10066817460936485, "grad_norm": 0.6620679497718811, "learning_rate": 1.9998312897626978e-05, "loss": 1.1433, "step": 242 }, { "epoch": 0.10108415880196553, "grad_norm": 0.5992295145988464, "learning_rate": 1.999822959877807e-05, "loss": 1.2741, "step": 243 }, { "epoch": 0.10150014299456621, "grad_norm": 0.6469533443450928, "learning_rate": 1.9998144292967722e-05, "loss": 1.0936, "step": 244 }, { "epoch": 0.10191612718716689, "grad_norm": 0.6054693460464478, "learning_rate": 1.9998056980213056e-05, "loss": 1.3786, "step": 245 }, { "epoch": 0.10233211137976757, "grad_norm": 0.6187664866447449, "learning_rate": 1.9997967660531597e-05, "loss": 1.2059, "step": 246 }, { "epoch": 0.10274809557236825, "grad_norm": 0.6184559464454651, "learning_rate": 1.999787633394127e-05, "loss": 1.2905, "step": 247 }, { "epoch": 0.10316407976496893, "grad_norm": 0.6825792193412781, "learning_rate": 1.9997783000460417e-05, "loss": 1.2434, "step": 248 }, { "epoch": 0.1035800639575696, "grad_norm": 0.6475814580917358, "learning_rate": 1.9997687660107766e-05, "loss": 1.2482, "step": 249 }, { "epoch": 0.1039960481501703, "grad_norm": 0.6080524325370789, "learning_rate": 1.999759031290246e-05, "loss": 1.1496, "step": 250 }, { "epoch": 0.10441203234277098, "grad_norm": 0.6287045478820801, "learning_rate": 1.9997490958864033e-05, "loss": 1.1831, "step": 251 }, { "epoch": 0.10482801653537166, "grad_norm": 0.7070287466049194, "learning_rate": 1.9997389598012436e-05, "loss": 1.2558, "step": 252 }, { "epoch": 0.10524400072797234, "grad_norm": 0.6424829959869385, "learning_rate": 1.999728623036801e-05, "loss": 1.1584, "step": 253 }, { "epoch": 0.10565998492057302, "grad_norm": 0.6233189105987549, "learning_rate": 1.9997180855951504e-05, "loss": 1.2234, "step": 254 }, { "epoch": 0.1060759691131737, "grad_norm": 0.5878159999847412, "learning_rate": 1.9997073474784073e-05, "loss": 1.035, "step": 255 }, { "epoch": 0.10649195330577438, "grad_norm": 0.5974534749984741, "learning_rate": 1.9996964086887268e-05, "loss": 1.1972, "step": 256 }, { "epoch": 0.10690793749837506, "grad_norm": 0.6987691521644592, "learning_rate": 1.999685269228305e-05, "loss": 1.1321, "step": 257 }, { "epoch": 0.10732392169097574, "grad_norm": 0.7622812986373901, "learning_rate": 1.999673929099378e-05, "loss": 1.2611, "step": 258 }, { "epoch": 0.10773990588357642, "grad_norm": 0.6739822030067444, "learning_rate": 1.999662388304222e-05, "loss": 1.1504, "step": 259 }, { "epoch": 0.10815589007617711, "grad_norm": 0.689831554889679, "learning_rate": 1.9996506468451535e-05, "loss": 1.3266, "step": 260 }, { "epoch": 0.10857187426877779, "grad_norm": 0.6558260917663574, "learning_rate": 1.999638704724529e-05, "loss": 1.235, "step": 261 }, { "epoch": 0.10898785846137847, "grad_norm": 0.6770922541618347, "learning_rate": 1.9996265619447467e-05, "loss": 1.2669, "step": 262 }, { "epoch": 0.10940384265397915, "grad_norm": 0.6816359758377075, "learning_rate": 1.9996142185082433e-05, "loss": 1.2288, "step": 263 }, { "epoch": 0.10981982684657983, "grad_norm": 0.6756231188774109, "learning_rate": 1.9996016744174965e-05, "loss": 1.1167, "step": 264 }, { "epoch": 0.11023581103918051, "grad_norm": 0.6750141978263855, "learning_rate": 1.9995889296750247e-05, "loss": 1.0958, "step": 265 }, { "epoch": 0.11065179523178119, "grad_norm": 0.6281485557556152, "learning_rate": 1.9995759842833856e-05, "loss": 1.2172, "step": 266 }, { "epoch": 0.11106777942438187, "grad_norm": 0.6432274580001831, "learning_rate": 1.9995628382451783e-05, "loss": 1.1059, "step": 267 }, { "epoch": 0.11148376361698255, "grad_norm": 0.6329265236854553, "learning_rate": 1.9995494915630414e-05, "loss": 1.1851, "step": 268 }, { "epoch": 0.11189974780958324, "grad_norm": 0.6731418371200562, "learning_rate": 1.9995359442396543e-05, "loss": 1.1948, "step": 269 }, { "epoch": 0.11231573200218392, "grad_norm": 0.6863015294075012, "learning_rate": 1.999522196277736e-05, "loss": 1.1394, "step": 270 }, { "epoch": 0.1127317161947846, "grad_norm": 0.6922223567962646, "learning_rate": 1.999508247680046e-05, "loss": 1.2992, "step": 271 }, { "epoch": 0.11314770038738528, "grad_norm": 0.6781586408615112, "learning_rate": 1.999494098449384e-05, "loss": 1.1905, "step": 272 }, { "epoch": 0.11356368457998596, "grad_norm": 0.7006034851074219, "learning_rate": 1.9994797485885917e-05, "loss": 1.1855, "step": 273 }, { "epoch": 0.11397966877258664, "grad_norm": 2.7518441677093506, "learning_rate": 1.9994651981005478e-05, "loss": 1.1236, "step": 274 }, { "epoch": 0.11439565296518732, "grad_norm": 0.7251802682876587, "learning_rate": 1.999450446988174e-05, "loss": 1.0456, "step": 275 }, { "epoch": 0.114811637157788, "grad_norm": 0.7490171790122986, "learning_rate": 1.9994354952544315e-05, "loss": 1.2134, "step": 276 }, { "epoch": 0.11522762135038868, "grad_norm": 0.7208209037780762, "learning_rate": 1.9994203429023208e-05, "loss": 1.243, "step": 277 }, { "epoch": 0.11564360554298937, "grad_norm": 0.7621865272521973, "learning_rate": 1.999404989934884e-05, "loss": 1.1885, "step": 278 }, { "epoch": 0.11605958973559005, "grad_norm": 0.6657444834709167, "learning_rate": 1.9993894363552025e-05, "loss": 1.1831, "step": 279 }, { "epoch": 0.11647557392819073, "grad_norm": 0.6804199814796448, "learning_rate": 1.999373682166399e-05, "loss": 1.1325, "step": 280 }, { "epoch": 0.11689155812079141, "grad_norm": 0.678875744342804, "learning_rate": 1.999357727371635e-05, "loss": 1.0534, "step": 281 }, { "epoch": 0.11730754231339209, "grad_norm": 38.468109130859375, "learning_rate": 1.9993415719741143e-05, "loss": 1.1408, "step": 282 }, { "epoch": 0.11772352650599277, "grad_norm": 0.7126438617706299, "learning_rate": 1.999325215977079e-05, "loss": 1.0866, "step": 283 }, { "epoch": 0.11813951069859345, "grad_norm": 0.7753421068191528, "learning_rate": 1.999308659383812e-05, "loss": 1.1861, "step": 284 }, { "epoch": 0.11855549489119413, "grad_norm": 0.7278493046760559, "learning_rate": 1.9992919021976376e-05, "loss": 1.1121, "step": 285 }, { "epoch": 0.11897147908379481, "grad_norm": 0.7159318923950195, "learning_rate": 1.9992749444219192e-05, "loss": 1.124, "step": 286 }, { "epoch": 0.1193874632763955, "grad_norm": 0.7180936932563782, "learning_rate": 1.9992577860600605e-05, "loss": 1.1905, "step": 287 }, { "epoch": 0.11980344746899618, "grad_norm": 0.7221747636795044, "learning_rate": 1.9992404271155056e-05, "loss": 1.1149, "step": 288 }, { "epoch": 0.12021943166159686, "grad_norm": 0.740716814994812, "learning_rate": 1.9992228675917392e-05, "loss": 1.1971, "step": 289 }, { "epoch": 0.12063541585419754, "grad_norm": 0.6795661449432373, "learning_rate": 1.9992051074922863e-05, "loss": 1.0671, "step": 290 }, { "epoch": 0.12105140004679822, "grad_norm": 0.7694556713104248, "learning_rate": 1.999187146820712e-05, "loss": 1.3945, "step": 291 }, { "epoch": 0.1214673842393989, "grad_norm": 0.7540693879127502, "learning_rate": 1.999168985580621e-05, "loss": 1.206, "step": 292 }, { "epoch": 0.12188336843199958, "grad_norm": 47.97311782836914, "learning_rate": 1.999150623775659e-05, "loss": 1.1528, "step": 293 }, { "epoch": 0.12229935262460026, "grad_norm": 0.8319066166877747, "learning_rate": 1.999132061409512e-05, "loss": 1.2102, "step": 294 }, { "epoch": 0.12271533681720094, "grad_norm": 0.7492083311080933, "learning_rate": 1.999113298485906e-05, "loss": 1.1177, "step": 295 }, { "epoch": 0.12313132100980163, "grad_norm": 137219.125, "learning_rate": 1.9990943350086072e-05, "loss": 1.1844, "step": 296 }, { "epoch": 0.12354730520240231, "grad_norm": 0.7345067262649536, "learning_rate": 1.9990751709814224e-05, "loss": 1.2386, "step": 297 }, { "epoch": 0.12396328939500299, "grad_norm": 0.7450856566429138, "learning_rate": 1.9990558064081983e-05, "loss": 1.1519, "step": 298 }, { "epoch": 0.12437927358760367, "grad_norm": 0.7386280298233032, "learning_rate": 1.999036241292822e-05, "loss": 1.1391, "step": 299 }, { "epoch": 0.12479525778020435, "grad_norm": 0.8047730922698975, "learning_rate": 1.9990164756392208e-05, "loss": 1.1298, "step": 300 }, { "epoch": 0.12521124197280503, "grad_norm": 0.781829833984375, "learning_rate": 1.998996509451362e-05, "loss": 1.2656, "step": 301 }, { "epoch": 0.12562722616540573, "grad_norm": 0.7816736102104187, "learning_rate": 1.9989763427332542e-05, "loss": 1.2334, "step": 302 }, { "epoch": 0.1260432103580064, "grad_norm": 11.881006240844727, "learning_rate": 1.9989559754889448e-05, "loss": 1.0342, "step": 303 }, { "epoch": 0.12645919455060708, "grad_norm": 0.8100154995918274, "learning_rate": 1.9989354077225223e-05, "loss": 1.2823, "step": 304 }, { "epoch": 0.12687517874320775, "grad_norm": 0.8673021197319031, "learning_rate": 1.9989146394381156e-05, "loss": 1.3053, "step": 305 }, { "epoch": 0.12729116293580844, "grad_norm": 0.7185091972351074, "learning_rate": 1.998893670639893e-05, "loss": 1.1366, "step": 306 }, { "epoch": 0.1277071471284091, "grad_norm": 0.7425987124443054, "learning_rate": 1.9988725013320646e-05, "loss": 1.0732, "step": 307 }, { "epoch": 0.1281231313210098, "grad_norm": 0.7009145617485046, "learning_rate": 1.9988511315188786e-05, "loss": 1.1052, "step": 308 }, { "epoch": 0.12853911551361047, "grad_norm": 0.760377049446106, "learning_rate": 1.9988295612046254e-05, "loss": 1.24, "step": 309 }, { "epoch": 0.12895509970621116, "grad_norm": 0.7180002927780151, "learning_rate": 1.998807790393634e-05, "loss": 1.2182, "step": 310 }, { "epoch": 0.12937108389881186, "grad_norm": 0.7163686156272888, "learning_rate": 1.9987858190902755e-05, "loss": 1.1377, "step": 311 }, { "epoch": 0.12978706809141252, "grad_norm": 0.7937809228897095, "learning_rate": 1.9987636472989598e-05, "loss": 1.1082, "step": 312 }, { "epoch": 0.13020305228401322, "grad_norm": 0.772754967212677, "learning_rate": 1.998741275024137e-05, "loss": 1.0397, "step": 313 }, { "epoch": 0.13061903647661388, "grad_norm": 0.7635824084281921, "learning_rate": 1.9987187022702992e-05, "loss": 1.2718, "step": 314 }, { "epoch": 0.13103502066921457, "grad_norm": 0.7451383471488953, "learning_rate": 1.9986959290419762e-05, "loss": 1.1326, "step": 315 }, { "epoch": 0.13145100486181524, "grad_norm": 0.8176464438438416, "learning_rate": 1.9986729553437398e-05, "loss": 1.2129, "step": 316 }, { "epoch": 0.13186698905441593, "grad_norm": 0.709227979183197, "learning_rate": 1.9986497811802015e-05, "loss": 1.1552, "step": 317 }, { "epoch": 0.1322829732470166, "grad_norm": 0.7636451125144958, "learning_rate": 1.9986264065560127e-05, "loss": 1.0408, "step": 318 }, { "epoch": 0.1326989574396173, "grad_norm": 0.7426299452781677, "learning_rate": 1.9986028314758664e-05, "loss": 1.1316, "step": 319 }, { "epoch": 0.133114941632218, "grad_norm": 602.46923828125, "learning_rate": 1.998579055944494e-05, "loss": 1.1928, "step": 320 }, { "epoch": 0.13353092582481865, "grad_norm": 0.8203775882720947, "learning_rate": 1.9985550799666686e-05, "loss": 1.1511, "step": 321 }, { "epoch": 0.13394691001741935, "grad_norm": 0.774730384349823, "learning_rate": 1.9985309035472023e-05, "loss": 1.0706, "step": 322 }, { "epoch": 0.13436289421002, "grad_norm": 0.7340565323829651, "learning_rate": 1.9985065266909484e-05, "loss": 1.1476, "step": 323 }, { "epoch": 0.1347788784026207, "grad_norm": 0.7326402068138123, "learning_rate": 1.9984819494028007e-05, "loss": 1.0241, "step": 324 }, { "epoch": 0.13519486259522137, "grad_norm": 0.825442910194397, "learning_rate": 1.998457171687691e-05, "loss": 1.1401, "step": 325 }, { "epoch": 0.13561084678782206, "grad_norm": 0.8084119558334351, "learning_rate": 1.9984321935505947e-05, "loss": 1.2236, "step": 326 }, { "epoch": 0.13602683098042273, "grad_norm": 0.8279935717582703, "learning_rate": 1.998407014996525e-05, "loss": 1.1377, "step": 327 }, { "epoch": 0.13644281517302342, "grad_norm": 0.791469156742096, "learning_rate": 1.9983816360305363e-05, "loss": 1.2748, "step": 328 }, { "epoch": 0.13685879936562412, "grad_norm": 129.91099548339844, "learning_rate": 1.998356056657722e-05, "loss": 1.104, "step": 329 }, { "epoch": 0.13727478355822478, "grad_norm": 0.7390044927597046, "learning_rate": 1.998330276883218e-05, "loss": 1.0717, "step": 330 }, { "epoch": 0.13769076775082548, "grad_norm": 0.7921305894851685, "learning_rate": 1.998304296712198e-05, "loss": 1.2155, "step": 331 }, { "epoch": 0.13810675194342614, "grad_norm": 0.8290470242500305, "learning_rate": 1.9982781161498784e-05, "loss": 1.3327, "step": 332 }, { "epoch": 0.13852273613602684, "grad_norm": 0.8409906625747681, "learning_rate": 1.998251735201513e-05, "loss": 1.2373, "step": 333 }, { "epoch": 0.1389387203286275, "grad_norm": 0.8277384638786316, "learning_rate": 1.9982251538723982e-05, "loss": 1.1384, "step": 334 }, { "epoch": 0.1393547045212282, "grad_norm": 0.8078048229217529, "learning_rate": 1.998198372167869e-05, "loss": 1.0829, "step": 335 }, { "epoch": 0.13977068871382886, "grad_norm": 0.730109453201294, "learning_rate": 1.9981713900933023e-05, "loss": 1.0461, "step": 336 }, { "epoch": 0.14018667290642955, "grad_norm": 0.8404555916786194, "learning_rate": 1.9981442076541136e-05, "loss": 1.1693, "step": 337 }, { "epoch": 0.14060265709903025, "grad_norm": 0.769023060798645, "learning_rate": 1.9981168248557594e-05, "loss": 1.0316, "step": 338 }, { "epoch": 0.14101864129163091, "grad_norm": 0.9137089848518372, "learning_rate": 1.9980892417037365e-05, "loss": 1.1891, "step": 339 }, { "epoch": 0.1414346254842316, "grad_norm": 0.8112242817878723, "learning_rate": 1.9980614582035815e-05, "loss": 1.0513, "step": 340 }, { "epoch": 0.14185060967683227, "grad_norm": 0.8220342397689819, "learning_rate": 1.9980334743608712e-05, "loss": 1.1471, "step": 341 }, { "epoch": 0.14226659386943297, "grad_norm": 0.7599901556968689, "learning_rate": 1.9980052901812232e-05, "loss": 1.1554, "step": 342 }, { "epoch": 0.14268257806203363, "grad_norm": 0.8765674829483032, "learning_rate": 1.997976905670295e-05, "loss": 1.122, "step": 343 }, { "epoch": 0.14309856225463433, "grad_norm": 0.8653687834739685, "learning_rate": 1.9979483208337837e-05, "loss": 1.1236, "step": 344 }, { "epoch": 0.143514546447235, "grad_norm": 0.7801381945610046, "learning_rate": 1.997919535677428e-05, "loss": 1.0911, "step": 345 }, { "epoch": 0.14393053063983569, "grad_norm": 0.7961997985839844, "learning_rate": 1.9978905502070055e-05, "loss": 1.1153, "step": 346 }, { "epoch": 0.14434651483243638, "grad_norm": 0.8459645509719849, "learning_rate": 1.9978613644283347e-05, "loss": 1.0602, "step": 347 }, { "epoch": 0.14476249902503704, "grad_norm": 0.7791506052017212, "learning_rate": 1.997831978347274e-05, "loss": 1.0314, "step": 348 }, { "epoch": 0.14517848321763774, "grad_norm": 0.9307152628898621, "learning_rate": 1.9978023919697217e-05, "loss": 1.3119, "step": 349 }, { "epoch": 0.1455944674102384, "grad_norm": 0.8029118776321411, "learning_rate": 1.9977726053016176e-05, "loss": 1.1189, "step": 350 }, { "epoch": 0.1460104516028391, "grad_norm": 0.828848659992218, "learning_rate": 1.9977426183489407e-05, "loss": 1.1316, "step": 351 }, { "epoch": 0.14642643579543976, "grad_norm": 0.7946081161499023, "learning_rate": 1.9977124311177097e-05, "loss": 1.1503, "step": 352 }, { "epoch": 0.14684241998804046, "grad_norm": 0.840021014213562, "learning_rate": 1.9976820436139843e-05, "loss": 1.1404, "step": 353 }, { "epoch": 0.14725840418064112, "grad_norm": 0.8943194150924683, "learning_rate": 1.9976514558438643e-05, "loss": 1.0209, "step": 354 }, { "epoch": 0.14767438837324182, "grad_norm": 0.7787627577781677, "learning_rate": 1.99762066781349e-05, "loss": 1.0481, "step": 355 }, { "epoch": 0.1480903725658425, "grad_norm": 0.9671856164932251, "learning_rate": 1.997589679529041e-05, "loss": 1.2205, "step": 356 }, { "epoch": 0.14850635675844318, "grad_norm": 2.8111603260040283, "learning_rate": 1.9975584909967382e-05, "loss": 1.0767, "step": 357 }, { "epoch": 0.14892234095104387, "grad_norm": 0.7566583156585693, "learning_rate": 1.9975271022228414e-05, "loss": 1.0805, "step": 358 }, { "epoch": 0.14933832514364453, "grad_norm": 0.7571346163749695, "learning_rate": 1.997495513213652e-05, "loss": 1.1062, "step": 359 }, { "epoch": 0.14975430933624523, "grad_norm": 19.755220413208008, "learning_rate": 1.9974637239755106e-05, "loss": 1.1198, "step": 360 }, { "epoch": 0.1501702935288459, "grad_norm": 0.8500381112098694, "learning_rate": 1.9974317345147985e-05, "loss": 1.1596, "step": 361 }, { "epoch": 0.1505862777214466, "grad_norm": 0.8949056267738342, "learning_rate": 1.997399544837937e-05, "loss": 1.1485, "step": 362 }, { "epoch": 0.15100226191404725, "grad_norm": 0.8218891024589539, "learning_rate": 1.997367154951387e-05, "loss": 1.1414, "step": 363 }, { "epoch": 0.15141824610664795, "grad_norm": 0.7790285348892212, "learning_rate": 1.997334564861651e-05, "loss": 1.1221, "step": 364 }, { "epoch": 0.15183423029924864, "grad_norm": 0.8069561719894409, "learning_rate": 1.99730177457527e-05, "loss": 1.1051, "step": 365 }, { "epoch": 0.1522502144918493, "grad_norm": 0.8034617900848389, "learning_rate": 1.9972687840988267e-05, "loss": 1.1807, "step": 366 }, { "epoch": 0.15266619868445, "grad_norm": 0.8357846736907959, "learning_rate": 1.9972355934389433e-05, "loss": 1.1158, "step": 367 }, { "epoch": 0.15308218287705067, "grad_norm": 0.8330258727073669, "learning_rate": 1.9972022026022822e-05, "loss": 1.0199, "step": 368 }, { "epoch": 0.15349816706965136, "grad_norm": 0.8245261311531067, "learning_rate": 1.997168611595546e-05, "loss": 1.1307, "step": 369 }, { "epoch": 0.15391415126225202, "grad_norm": 0.7980736494064331, "learning_rate": 1.9971348204254766e-05, "loss": 1.0059, "step": 370 }, { "epoch": 0.15433013545485272, "grad_norm": 0.8993247747421265, "learning_rate": 1.997100829098858e-05, "loss": 1.0707, "step": 371 }, { "epoch": 0.15474611964745338, "grad_norm": 0.8719295859336853, "learning_rate": 1.9970666376225136e-05, "loss": 1.0862, "step": 372 }, { "epoch": 0.15516210384005408, "grad_norm": 0.7660198211669922, "learning_rate": 1.9970322460033058e-05, "loss": 1.0507, "step": 373 }, { "epoch": 0.15557808803265477, "grad_norm": 0.8158249855041504, "learning_rate": 1.9969976542481385e-05, "loss": 1.0553, "step": 374 }, { "epoch": 0.15599407222525544, "grad_norm": 0.8492237329483032, "learning_rate": 1.9969628623639553e-05, "loss": 1.1833, "step": 375 }, { "epoch": 0.15641005641785613, "grad_norm": 0.8411293625831604, "learning_rate": 1.99692787035774e-05, "loss": 1.0726, "step": 376 }, { "epoch": 0.1568260406104568, "grad_norm": 0.8067759871482849, "learning_rate": 1.996892678236517e-05, "loss": 1.2142, "step": 377 }, { "epoch": 0.1572420248030575, "grad_norm": 0.8767145872116089, "learning_rate": 1.9968572860073494e-05, "loss": 1.0515, "step": 378 }, { "epoch": 0.15765800899565816, "grad_norm": 0.8479776978492737, "learning_rate": 1.9968216936773425e-05, "loss": 1.2088, "step": 379 }, { "epoch": 0.15807399318825885, "grad_norm": 0.8671485781669617, "learning_rate": 1.996785901253641e-05, "loss": 1.181, "step": 380 }, { "epoch": 0.15848997738085951, "grad_norm": 0.8266106247901917, "learning_rate": 1.996749908743429e-05, "loss": 1.1745, "step": 381 }, { "epoch": 0.1589059615734602, "grad_norm": 0.9021766185760498, "learning_rate": 1.9967137161539312e-05, "loss": 1.0443, "step": 382 }, { "epoch": 0.1593219457660609, "grad_norm": 0.8564395904541016, "learning_rate": 1.996677323492413e-05, "loss": 1.2054, "step": 383 }, { "epoch": 0.15973792995866157, "grad_norm": 0.9116708636283875, "learning_rate": 1.9966407307661796e-05, "loss": 1.1198, "step": 384 }, { "epoch": 0.16015391415126226, "grad_norm": 67.18666076660156, "learning_rate": 1.996603937982576e-05, "loss": 1.0617, "step": 385 }, { "epoch": 0.16056989834386293, "grad_norm": 0.8681408166885376, "learning_rate": 1.9965669451489882e-05, "loss": 1.1673, "step": 386 }, { "epoch": 0.16098588253646362, "grad_norm": 0.8447865843772888, "learning_rate": 1.996529752272841e-05, "loss": 1.0179, "step": 387 }, { "epoch": 0.16140186672906429, "grad_norm": 0.8744534850120544, "learning_rate": 1.996492359361601e-05, "loss": 1.0656, "step": 388 }, { "epoch": 0.16181785092166498, "grad_norm": 0.8761427998542786, "learning_rate": 1.9964547664227737e-05, "loss": 1.1053, "step": 389 }, { "epoch": 0.16223383511426565, "grad_norm": 0.9251875281333923, "learning_rate": 1.996416973463905e-05, "loss": 1.0894, "step": 390 }, { "epoch": 0.16264981930686634, "grad_norm": 3.763282060623169, "learning_rate": 1.9963789804925818e-05, "loss": 1.0765, "step": 391 }, { "epoch": 0.16306580349946703, "grad_norm": 0.9673831462860107, "learning_rate": 1.99634078751643e-05, "loss": 1.1891, "step": 392 }, { "epoch": 0.1634817876920677, "grad_norm": 0.9208243489265442, "learning_rate": 1.9963023945431167e-05, "loss": 1.063, "step": 393 }, { "epoch": 0.1638977718846684, "grad_norm": 0.8496806621551514, "learning_rate": 1.9962638015803475e-05, "loss": 1.0202, "step": 394 }, { "epoch": 0.16431375607726906, "grad_norm": 0.9675566554069519, "learning_rate": 1.9962250086358702e-05, "loss": 1.1583, "step": 395 }, { "epoch": 0.16472974026986975, "grad_norm": 0.8452079892158508, "learning_rate": 1.9961860157174715e-05, "loss": 1.1355, "step": 396 }, { "epoch": 0.16514572446247042, "grad_norm": 0.8956078886985779, "learning_rate": 1.9961468228329784e-05, "loss": 1.1059, "step": 397 }, { "epoch": 0.1655617086550711, "grad_norm": 0.9005361795425415, "learning_rate": 1.9961074299902582e-05, "loss": 1.1916, "step": 398 }, { "epoch": 0.16597769284767178, "grad_norm": 0.8705242276191711, "learning_rate": 1.9960678371972184e-05, "loss": 1.0004, "step": 399 }, { "epoch": 0.16639367704027247, "grad_norm": 0.951647937297821, "learning_rate": 1.9960280444618065e-05, "loss": 1.0792, "step": 400 }, { "epoch": 0.16680966123287316, "grad_norm": 0.9446443915367126, "learning_rate": 1.9959880517920102e-05, "loss": 1.042, "step": 401 }, { "epoch": 0.16722564542547383, "grad_norm": 0.890379011631012, "learning_rate": 1.995947859195857e-05, "loss": 1.0787, "step": 402 }, { "epoch": 0.16764162961807452, "grad_norm": 0.9147363901138306, "learning_rate": 1.9959074666814154e-05, "loss": 1.042, "step": 403 }, { "epoch": 0.1680576138106752, "grad_norm": 0.999471127986908, "learning_rate": 1.9958668742567925e-05, "loss": 1.0869, "step": 404 }, { "epoch": 0.16847359800327588, "grad_norm": 0.8799265027046204, "learning_rate": 1.9958260819301375e-05, "loss": 0.974, "step": 405 }, { "epoch": 0.16888958219587655, "grad_norm": 0.8619824647903442, "learning_rate": 1.9957850897096383e-05, "loss": 1.0303, "step": 406 }, { "epoch": 0.16930556638847724, "grad_norm": 0.8960070610046387, "learning_rate": 1.9957438976035232e-05, "loss": 1.2049, "step": 407 }, { "epoch": 0.1697215505810779, "grad_norm": 23.367433547973633, "learning_rate": 1.9957025056200606e-05, "loss": 1.0327, "step": 408 }, { "epoch": 0.1701375347736786, "grad_norm": 0.9393708109855652, "learning_rate": 1.99566091376756e-05, "loss": 1.0155, "step": 409 }, { "epoch": 0.1705535189662793, "grad_norm": 0.9946883320808411, "learning_rate": 1.9956191220543696e-05, "loss": 1.1571, "step": 410 }, { "epoch": 0.17096950315887996, "grad_norm": 0.8686994910240173, "learning_rate": 1.995577130488878e-05, "loss": 1.178, "step": 411 }, { "epoch": 0.17138548735148065, "grad_norm": 0.9101563692092896, "learning_rate": 1.9955349390795153e-05, "loss": 1.1553, "step": 412 }, { "epoch": 0.17180147154408132, "grad_norm": 0.9591279029846191, "learning_rate": 1.9954925478347494e-05, "loss": 1.1673, "step": 413 }, { "epoch": 0.172217455736682, "grad_norm": 0.9458246827125549, "learning_rate": 1.9954499567630904e-05, "loss": 1.0937, "step": 414 }, { "epoch": 0.17263343992928268, "grad_norm": 0.9132758975028992, "learning_rate": 1.9954071658730875e-05, "loss": 1.1374, "step": 415 }, { "epoch": 0.17304942412188337, "grad_norm": 0.871336817741394, "learning_rate": 1.99536417517333e-05, "loss": 0.9618, "step": 416 }, { "epoch": 0.17346540831448404, "grad_norm": 0.9626541137695312, "learning_rate": 1.9953209846724477e-05, "loss": 1.0914, "step": 417 }, { "epoch": 0.17388139250708473, "grad_norm": 1.0190753936767578, "learning_rate": 1.9952775943791104e-05, "loss": 1.1741, "step": 418 }, { "epoch": 0.17429737669968542, "grad_norm": 0.9116002321243286, "learning_rate": 1.9952340043020273e-05, "loss": 1.1375, "step": 419 }, { "epoch": 0.1747133608922861, "grad_norm": 0.8081496953964233, "learning_rate": 1.9951902144499487e-05, "loss": 1.0862, "step": 420 }, { "epoch": 0.17512934508488678, "grad_norm": 0.9783990979194641, "learning_rate": 1.995146224831665e-05, "loss": 1.1324, "step": 421 }, { "epoch": 0.17554532927748745, "grad_norm": 1.061558723449707, "learning_rate": 1.995102035456006e-05, "loss": 1.0575, "step": 422 }, { "epoch": 0.17596131347008814, "grad_norm": 0.8990135192871094, "learning_rate": 1.9950576463318416e-05, "loss": 1.143, "step": 423 }, { "epoch": 0.1763772976626888, "grad_norm": 0.8926308751106262, "learning_rate": 1.9950130574680822e-05, "loss": 1.2086, "step": 424 }, { "epoch": 0.1767932818552895, "grad_norm": 0.8875197768211365, "learning_rate": 1.9949682688736788e-05, "loss": 0.9969, "step": 425 }, { "epoch": 0.17720926604789017, "grad_norm": 0.8743798732757568, "learning_rate": 1.9949232805576213e-05, "loss": 1.1389, "step": 426 }, { "epoch": 0.17762525024049086, "grad_norm": 0.8724097609519958, "learning_rate": 1.9948780925289403e-05, "loss": 1.2192, "step": 427 }, { "epoch": 0.17804123443309156, "grad_norm": 0.9909781217575073, "learning_rate": 1.9948327047967066e-05, "loss": 1.1571, "step": 428 }, { "epoch": 0.17845721862569222, "grad_norm": 0.9148898720741272, "learning_rate": 1.994787117370031e-05, "loss": 1.1199, "step": 429 }, { "epoch": 0.17887320281829291, "grad_norm": 0.9072756767272949, "learning_rate": 1.9947413302580644e-05, "loss": 1.0281, "step": 430 }, { "epoch": 0.17928918701089358, "grad_norm": 1.0467748641967773, "learning_rate": 1.9946953434699977e-05, "loss": 1.3202, "step": 431 }, { "epoch": 0.17970517120349427, "grad_norm": 0.9875684380531311, "learning_rate": 1.9946491570150618e-05, "loss": 1.1273, "step": 432 }, { "epoch": 0.18012115539609494, "grad_norm": 0.9448107481002808, "learning_rate": 1.994602770902528e-05, "loss": 1.131, "step": 433 }, { "epoch": 0.18053713958869563, "grad_norm": 0.9337133765220642, "learning_rate": 1.994556185141707e-05, "loss": 1.012, "step": 434 }, { "epoch": 0.1809531237812963, "grad_norm": 0.9700999855995178, "learning_rate": 1.99450939974195e-05, "loss": 1.1177, "step": 435 }, { "epoch": 0.181369107973897, "grad_norm": 0.9828324913978577, "learning_rate": 1.994462414712649e-05, "loss": 1.2612, "step": 436 }, { "epoch": 0.18178509216649769, "grad_norm": 0.9573013186454773, "learning_rate": 1.994415230063235e-05, "loss": 0.9794, "step": 437 }, { "epoch": 0.18220107635909835, "grad_norm": 0.8753162622451782, "learning_rate": 1.9943678458031796e-05, "loss": 1.1094, "step": 438 }, { "epoch": 0.18261706055169905, "grad_norm": 1.0988367795944214, "learning_rate": 1.9943202619419935e-05, "loss": 1.0113, "step": 439 }, { "epoch": 0.1830330447442997, "grad_norm": 0.9037076830863953, "learning_rate": 1.9942724784892296e-05, "loss": 1.0333, "step": 440 }, { "epoch": 0.1834490289369004, "grad_norm": 0.9828957915306091, "learning_rate": 1.994224495454479e-05, "loss": 1.0846, "step": 441 }, { "epoch": 0.18386501312950107, "grad_norm": 1.0369386672973633, "learning_rate": 1.994176312847373e-05, "loss": 1.1145, "step": 442 }, { "epoch": 0.18428099732210176, "grad_norm": 0.9329926371574402, "learning_rate": 1.994127930677584e-05, "loss": 1.1258, "step": 443 }, { "epoch": 0.18469698151470243, "grad_norm": 0.9606976509094238, "learning_rate": 1.9940793489548236e-05, "loss": 1.0866, "step": 444 }, { "epoch": 0.18511296570730312, "grad_norm": 1.087174654006958, "learning_rate": 1.9940305676888433e-05, "loss": 1.0752, "step": 445 }, { "epoch": 0.18552894989990382, "grad_norm": 0.9600853323936462, "learning_rate": 1.9939815868894355e-05, "loss": 1.0879, "step": 446 }, { "epoch": 0.18594493409250448, "grad_norm": 0.9931787252426147, "learning_rate": 1.993932406566432e-05, "loss": 1.1024, "step": 447 }, { "epoch": 0.18636091828510518, "grad_norm": 0.9901102781295776, "learning_rate": 1.993883026729705e-05, "loss": 1.0577, "step": 448 }, { "epoch": 0.18677690247770584, "grad_norm": 27.26092529296875, "learning_rate": 1.9938334473891666e-05, "loss": 1.1688, "step": 449 }, { "epoch": 0.18719288667030654, "grad_norm": 0.890872597694397, "learning_rate": 1.993783668554769e-05, "loss": 1.1318, "step": 450 }, { "epoch": 0.1876088708629072, "grad_norm": 0.9122032523155212, "learning_rate": 1.9937336902365044e-05, "loss": 1.0286, "step": 451 }, { "epoch": 0.1880248550555079, "grad_norm": 0.882926344871521, "learning_rate": 1.993683512444405e-05, "loss": 1.032, "step": 452 }, { "epoch": 0.18844083924810856, "grad_norm": 0.9216632843017578, "learning_rate": 1.9936331351885424e-05, "loss": 1.1449, "step": 453 }, { "epoch": 0.18885682344070925, "grad_norm": 0.9233924746513367, "learning_rate": 1.9935825584790295e-05, "loss": 0.9461, "step": 454 }, { "epoch": 0.18927280763330995, "grad_norm": 0.8849272727966309, "learning_rate": 1.9935317823260187e-05, "loss": 1.0414, "step": 455 }, { "epoch": 0.1896887918259106, "grad_norm": 0.902531623840332, "learning_rate": 1.9934808067397025e-05, "loss": 1.1264, "step": 456 }, { "epoch": 0.1901047760185113, "grad_norm": 0.9148010015487671, "learning_rate": 1.9934296317303132e-05, "loss": 1.0345, "step": 457 }, { "epoch": 0.19052076021111197, "grad_norm": 0.8911391496658325, "learning_rate": 1.9933782573081235e-05, "loss": 0.9896, "step": 458 }, { "epoch": 0.19093674440371267, "grad_norm": 1.1076544523239136, "learning_rate": 1.9933266834834448e-05, "loss": 1.1756, "step": 459 }, { "epoch": 0.19135272859631333, "grad_norm": 0.906173586845398, "learning_rate": 1.9932749102666306e-05, "loss": 0.9995, "step": 460 }, { "epoch": 0.19176871278891403, "grad_norm": 0.9611719250679016, "learning_rate": 1.9932229376680733e-05, "loss": 1.056, "step": 461 }, { "epoch": 0.1921846969815147, "grad_norm": 0.9627857208251953, "learning_rate": 1.993170765698205e-05, "loss": 1.0655, "step": 462 }, { "epoch": 0.19260068117411538, "grad_norm": 0.9944802522659302, "learning_rate": 1.9931183943674992e-05, "loss": 1.06, "step": 463 }, { "epoch": 0.19301666536671608, "grad_norm": 35.48960876464844, "learning_rate": 1.9930658236864673e-05, "loss": 1.0062, "step": 464 }, { "epoch": 0.19343264955931674, "grad_norm": 0.9571504592895508, "learning_rate": 1.9930130536656626e-05, "loss": 1.1398, "step": 465 }, { "epoch": 0.19384863375191744, "grad_norm": 1.038862943649292, "learning_rate": 1.992960084315678e-05, "loss": 0.966, "step": 466 }, { "epoch": 0.1942646179445181, "grad_norm": 1.0720785856246948, "learning_rate": 1.9929069156471455e-05, "loss": 1.063, "step": 467 }, { "epoch": 0.1946806021371188, "grad_norm": 0.8971590399742126, "learning_rate": 1.9928535476707374e-05, "loss": 1.0137, "step": 468 }, { "epoch": 0.19509658632971946, "grad_norm": 0.9654543399810791, "learning_rate": 1.9927999803971676e-05, "loss": 1.0695, "step": 469 }, { "epoch": 0.19551257052232016, "grad_norm": 46.630218505859375, "learning_rate": 1.992746213837188e-05, "loss": 1.0816, "step": 470 }, { "epoch": 0.19592855471492082, "grad_norm": 0.9204821586608887, "learning_rate": 1.992692248001591e-05, "loss": 1.067, "step": 471 }, { "epoch": 0.19634453890752152, "grad_norm": 0.9442223906517029, "learning_rate": 1.9926380829012096e-05, "loss": 1.0859, "step": 472 }, { "epoch": 0.1967605231001222, "grad_norm": 0.9762701392173767, "learning_rate": 1.9925837185469166e-05, "loss": 1.0757, "step": 473 }, { "epoch": 0.19717650729272287, "grad_norm": 0.9677730798721313, "learning_rate": 1.9925291549496242e-05, "loss": 1.0482, "step": 474 }, { "epoch": 0.19759249148532357, "grad_norm": 0.937849223613739, "learning_rate": 1.9924743921202852e-05, "loss": 1.0516, "step": 475 }, { "epoch": 0.19800847567792423, "grad_norm": 0.9843096137046814, "learning_rate": 1.9924194300698927e-05, "loss": 1.0122, "step": 476 }, { "epoch": 0.19842445987052493, "grad_norm": 0.8854307532310486, "learning_rate": 1.9923642688094786e-05, "loss": 0.929, "step": 477 }, { "epoch": 0.1988404440631256, "grad_norm": 1.048207402229309, "learning_rate": 1.992308908350116e-05, "loss": 1.1339, "step": 478 }, { "epoch": 0.1992564282557263, "grad_norm": 1.1947593688964844, "learning_rate": 1.9922533487029177e-05, "loss": 1.16, "step": 479 }, { "epoch": 0.19967241244832695, "grad_norm": 43.683876037597656, "learning_rate": 1.9921975898790355e-05, "loss": 1.1512, "step": 480 }, { "epoch": 0.20008839664092765, "grad_norm": 1.0083413124084473, "learning_rate": 1.9921416318896628e-05, "loss": 1.0849, "step": 481 }, { "epoch": 0.20050438083352834, "grad_norm": 0.9708366990089417, "learning_rate": 1.9920854747460313e-05, "loss": 0.9676, "step": 482 }, { "epoch": 0.200920365026129, "grad_norm": 1.0153459310531616, "learning_rate": 1.9920291184594143e-05, "loss": 1.1018, "step": 483 }, { "epoch": 0.2013363492187297, "grad_norm": 0.9628456830978394, "learning_rate": 1.9919725630411237e-05, "loss": 1.088, "step": 484 }, { "epoch": 0.20175233341133036, "grad_norm": 140.67921447753906, "learning_rate": 1.9919158085025122e-05, "loss": 1.0292, "step": 485 }, { "epoch": 0.20216831760393106, "grad_norm": 1.0099544525146484, "learning_rate": 1.9918588548549727e-05, "loss": 1.0505, "step": 486 }, { "epoch": 0.20258430179653172, "grad_norm": 0.9849467873573303, "learning_rate": 1.991801702109937e-05, "loss": 0.8447, "step": 487 }, { "epoch": 0.20300028598913242, "grad_norm": 1.0162895917892456, "learning_rate": 1.9917443502788774e-05, "loss": 1.0519, "step": 488 }, { "epoch": 0.20341627018173308, "grad_norm": 0.9089003801345825, "learning_rate": 1.9916867993733066e-05, "loss": 1.0398, "step": 489 }, { "epoch": 0.20383225437433378, "grad_norm": 0.89839106798172, "learning_rate": 1.9916290494047766e-05, "loss": 0.9683, "step": 490 }, { "epoch": 0.20424823856693447, "grad_norm": 0.9736869931221008, "learning_rate": 1.9915711003848802e-05, "loss": 1.0388, "step": 491 }, { "epoch": 0.20466422275953514, "grad_norm": 1.0375293493270874, "learning_rate": 1.9915129523252487e-05, "loss": 1.1142, "step": 492 }, { "epoch": 0.20508020695213583, "grad_norm": 0.9568646550178528, "learning_rate": 1.991454605237555e-05, "loss": 1.1244, "step": 493 }, { "epoch": 0.2054961911447365, "grad_norm": 0.980621337890625, "learning_rate": 1.991396059133511e-05, "loss": 1.1422, "step": 494 }, { "epoch": 0.2059121753373372, "grad_norm": 0.9289761185646057, "learning_rate": 1.9913373140248687e-05, "loss": 1.0396, "step": 495 }, { "epoch": 0.20632815952993785, "grad_norm": 1.061082363128662, "learning_rate": 1.99127836992342e-05, "loss": 1.0952, "step": 496 }, { "epoch": 0.20674414372253855, "grad_norm": 227.12095642089844, "learning_rate": 1.991219226840997e-05, "loss": 1.0478, "step": 497 }, { "epoch": 0.2071601279151392, "grad_norm": 1.024208903312683, "learning_rate": 1.9911598847894714e-05, "loss": 1.0586, "step": 498 }, { "epoch": 0.2075761121077399, "grad_norm": 0.937858521938324, "learning_rate": 1.9911003437807554e-05, "loss": 1.0667, "step": 499 }, { "epoch": 0.2079920963003406, "grad_norm": 1.0490220785140991, "learning_rate": 1.9910406038268008e-05, "loss": 0.9856, "step": 500 }, { "epoch": 0.2079920963003406, "eval_loss": 0.9742390513420105, "eval_runtime": 1723.9075, "eval_samples_per_second": 3.823, "eval_steps_per_second": 1.912, "step": 500 }, { "epoch": 0.20840808049294127, "grad_norm": 4143.06591796875, "learning_rate": 1.9909806649395986e-05, "loss": 1.1043, "step": 501 }, { "epoch": 0.20882406468554196, "grad_norm": 1.0358145236968994, "learning_rate": 1.990920527131181e-05, "loss": 1.2401, "step": 502 }, { "epoch": 0.20924004887814263, "grad_norm": 1.018070936203003, "learning_rate": 1.99086019041362e-05, "loss": 0.8589, "step": 503 }, { "epoch": 0.20965603307074332, "grad_norm": 14.824268341064453, "learning_rate": 1.990799654799026e-05, "loss": 0.997, "step": 504 }, { "epoch": 0.21007201726334399, "grad_norm": 1.0036373138427734, "learning_rate": 1.990738920299551e-05, "loss": 1.0896, "step": 505 }, { "epoch": 0.21048800145594468, "grad_norm": 0.9147713780403137, "learning_rate": 1.9906779869273864e-05, "loss": 1.002, "step": 506 }, { "epoch": 0.21090398564854534, "grad_norm": 0.9546721577644348, "learning_rate": 1.9906168546947634e-05, "loss": 0.8478, "step": 507 }, { "epoch": 0.21131996984114604, "grad_norm": 9492.564453125, "learning_rate": 1.9905555236139533e-05, "loss": 1.0678, "step": 508 }, { "epoch": 0.2117359540337467, "grad_norm": 1.0289603471755981, "learning_rate": 1.9904939936972667e-05, "loss": 0.9859, "step": 509 }, { "epoch": 0.2121519382263474, "grad_norm": 1.0345205068588257, "learning_rate": 1.9904322649570552e-05, "loss": 1.0575, "step": 510 }, { "epoch": 0.2125679224189481, "grad_norm": 1.0464143753051758, "learning_rate": 1.9903703374057094e-05, "loss": 1.0728, "step": 511 }, { "epoch": 0.21298390661154876, "grad_norm": 1.0269359350204468, "learning_rate": 1.99030821105566e-05, "loss": 0.8761, "step": 512 }, { "epoch": 0.21339989080414945, "grad_norm": 1.048438310623169, "learning_rate": 1.990245885919378e-05, "loss": 1.0485, "step": 513 }, { "epoch": 0.21381587499675012, "grad_norm": 1.033692717552185, "learning_rate": 1.990183362009374e-05, "loss": 1.02, "step": 514 }, { "epoch": 0.2142318591893508, "grad_norm": 1.0566153526306152, "learning_rate": 1.990120639338198e-05, "loss": 1.0837, "step": 515 }, { "epoch": 0.21464784338195148, "grad_norm": 0.9666121006011963, "learning_rate": 1.9900577179184413e-05, "loss": 1.0788, "step": 516 }, { "epoch": 0.21506382757455217, "grad_norm": 0.9638078808784485, "learning_rate": 1.989994597762734e-05, "loss": 0.9717, "step": 517 }, { "epoch": 0.21547981176715283, "grad_norm": 0.9933183193206787, "learning_rate": 1.9899312788837458e-05, "loss": 1.053, "step": 518 }, { "epoch": 0.21589579595975353, "grad_norm": 1.0278891324996948, "learning_rate": 1.9898677612941874e-05, "loss": 0.9739, "step": 519 }, { "epoch": 0.21631178015235422, "grad_norm": 0.9589548707008362, "learning_rate": 1.9898040450068084e-05, "loss": 0.9236, "step": 520 }, { "epoch": 0.2167277643449549, "grad_norm": 1.0142326354980469, "learning_rate": 1.9897401300343985e-05, "loss": 1.0996, "step": 521 }, { "epoch": 0.21714374853755558, "grad_norm": 1.0015535354614258, "learning_rate": 1.989676016389788e-05, "loss": 0.9566, "step": 522 }, { "epoch": 0.21755973273015625, "grad_norm": 0.9645442366600037, "learning_rate": 1.989611704085846e-05, "loss": 0.9757, "step": 523 }, { "epoch": 0.21797571692275694, "grad_norm": 1.0508140325546265, "learning_rate": 1.989547193135483e-05, "loss": 1.0874, "step": 524 }, { "epoch": 0.2183917011153576, "grad_norm": 1.0415077209472656, "learning_rate": 1.9894824835516474e-05, "loss": 1.02, "step": 525 }, { "epoch": 0.2188076853079583, "grad_norm": 1.080237627029419, "learning_rate": 1.9894175753473284e-05, "loss": 1.0555, "step": 526 }, { "epoch": 0.21922366950055897, "grad_norm": 0.9380131363868713, "learning_rate": 1.989352468535556e-05, "loss": 1.0105, "step": 527 }, { "epoch": 0.21963965369315966, "grad_norm": 0.9831803441047668, "learning_rate": 1.9892871631293985e-05, "loss": 1.0158, "step": 528 }, { "epoch": 0.22005563788576035, "grad_norm": 7.770525932312012, "learning_rate": 1.989221659141965e-05, "loss": 1.0988, "step": 529 }, { "epoch": 0.22047162207836102, "grad_norm": 0.9965048432350159, "learning_rate": 1.989155956586404e-05, "loss": 0.9627, "step": 530 }, { "epoch": 0.2208876062709617, "grad_norm": 1.0598105192184448, "learning_rate": 1.989090055475904e-05, "loss": 1.0228, "step": 531 }, { "epoch": 0.22130359046356238, "grad_norm": 1.030524730682373, "learning_rate": 1.9890239558236947e-05, "loss": 1.0805, "step": 532 }, { "epoch": 0.22171957465616307, "grad_norm": 1.0234934091567993, "learning_rate": 1.9889576576430425e-05, "loss": 1.1024, "step": 533 }, { "epoch": 0.22213555884876374, "grad_norm": 0.9998660087585449, "learning_rate": 1.9888911609472568e-05, "loss": 0.9981, "step": 534 }, { "epoch": 0.22255154304136443, "grad_norm": 0.9393244385719299, "learning_rate": 1.9888244657496854e-05, "loss": 0.8793, "step": 535 }, { "epoch": 0.2229675272339651, "grad_norm": 1.1130553483963013, "learning_rate": 1.9887575720637157e-05, "loss": 1.1183, "step": 536 }, { "epoch": 0.2233835114265658, "grad_norm": 0.976149320602417, "learning_rate": 1.9886904799027757e-05, "loss": 0.9463, "step": 537 }, { "epoch": 0.22379949561916648, "grad_norm": 1.071610689163208, "learning_rate": 1.988623189280333e-05, "loss": 1.0565, "step": 538 }, { "epoch": 0.22421547981176715, "grad_norm": 0.9420391917228699, "learning_rate": 1.9885557002098948e-05, "loss": 1.0297, "step": 539 }, { "epoch": 0.22463146400436784, "grad_norm": 0.9499551653862, "learning_rate": 1.9884880127050082e-05, "loss": 1.0356, "step": 540 }, { "epoch": 0.2250474481969685, "grad_norm": 0.9973178505897522, "learning_rate": 1.9884201267792608e-05, "loss": 1.0218, "step": 541 }, { "epoch": 0.2254634323895692, "grad_norm": 1.0458414554595947, "learning_rate": 1.9883520424462784e-05, "loss": 1.0269, "step": 542 }, { "epoch": 0.22587941658216987, "grad_norm": 1.1280735731124878, "learning_rate": 1.988283759719729e-05, "loss": 1.0641, "step": 543 }, { "epoch": 0.22629540077477056, "grad_norm": 1.0307750701904297, "learning_rate": 1.9882152786133182e-05, "loss": 1.1066, "step": 544 }, { "epoch": 0.22671138496737123, "grad_norm": 0.9585261940956116, "learning_rate": 1.988146599140792e-05, "loss": 1.01, "step": 545 }, { "epoch": 0.22712736915997192, "grad_norm": 1.167088270187378, "learning_rate": 1.988077721315938e-05, "loss": 1.1855, "step": 546 }, { "epoch": 0.2275433533525726, "grad_norm": 1.0217463970184326, "learning_rate": 1.9880086451525805e-05, "loss": 1.0694, "step": 547 }, { "epoch": 0.22795933754517328, "grad_norm": 1.071991205215454, "learning_rate": 1.9879393706645865e-05, "loss": 0.9362, "step": 548 }, { "epoch": 0.22837532173777397, "grad_norm": 1001.6079711914062, "learning_rate": 1.987869897865861e-05, "loss": 1.0265, "step": 549 }, { "epoch": 0.22879130593037464, "grad_norm": 1.0453846454620361, "learning_rate": 1.9878002267703493e-05, "loss": 1.0022, "step": 550 }, { "epoch": 0.22920729012297533, "grad_norm": 1.0561352968215942, "learning_rate": 1.987730357392037e-05, "loss": 1.1315, "step": 551 }, { "epoch": 0.229623274315576, "grad_norm": 1.037642002105713, "learning_rate": 1.9876602897449488e-05, "loss": 1.1431, "step": 552 }, { "epoch": 0.2300392585081767, "grad_norm": 1.0311110019683838, "learning_rate": 1.9875900238431495e-05, "loss": 0.9646, "step": 553 }, { "epoch": 0.23045524270077736, "grad_norm": 1.0265607833862305, "learning_rate": 1.987519559700744e-05, "loss": 0.9495, "step": 554 }, { "epoch": 0.23087122689337805, "grad_norm": 1.002938151359558, "learning_rate": 1.9874488973318765e-05, "loss": 0.9761, "step": 555 }, { "epoch": 0.23128721108597874, "grad_norm": 1.0223989486694336, "learning_rate": 1.987378036750731e-05, "loss": 1.0142, "step": 556 }, { "epoch": 0.2317031952785794, "grad_norm": 1.105521559715271, "learning_rate": 1.987306977971532e-05, "loss": 1.1451, "step": 557 }, { "epoch": 0.2321191794711801, "grad_norm": 0.9998063445091248, "learning_rate": 1.987235721008542e-05, "loss": 1.0185, "step": 558 }, { "epoch": 0.23253516366378077, "grad_norm": 1.1388511657714844, "learning_rate": 1.987164265876066e-05, "loss": 1.12, "step": 559 }, { "epoch": 0.23295114785638146, "grad_norm": 1.0170528888702393, "learning_rate": 1.9870926125884465e-05, "loss": 0.9815, "step": 560 }, { "epoch": 0.23336713204898213, "grad_norm": 1.037130355834961, "learning_rate": 1.987020761160067e-05, "loss": 1.0862, "step": 561 }, { "epoch": 0.23378311624158282, "grad_norm": 1.0800973176956177, "learning_rate": 1.98694871160535e-05, "loss": 1.0295, "step": 562 }, { "epoch": 0.2341991004341835, "grad_norm": 0.9890981316566467, "learning_rate": 1.9868764639387584e-05, "loss": 0.9827, "step": 563 }, { "epoch": 0.23461508462678418, "grad_norm": 0.9698352813720703, "learning_rate": 1.9868040181747944e-05, "loss": 0.997, "step": 564 }, { "epoch": 0.23503106881938488, "grad_norm": 1.0517358779907227, "learning_rate": 1.986731374328e-05, "loss": 0.9456, "step": 565 }, { "epoch": 0.23544705301198554, "grad_norm": 0.966422975063324, "learning_rate": 1.9866585324129578e-05, "loss": 1.0055, "step": 566 }, { "epoch": 0.23586303720458623, "grad_norm": 1.0880509614944458, "learning_rate": 1.9865854924442885e-05, "loss": 1.0615, "step": 567 }, { "epoch": 0.2362790213971869, "grad_norm": 1.0649336576461792, "learning_rate": 1.9865122544366544e-05, "loss": 0.9581, "step": 568 }, { "epoch": 0.2366950055897876, "grad_norm": 1.010870099067688, "learning_rate": 1.9864388184047564e-05, "loss": 0.885, "step": 569 }, { "epoch": 0.23711098978238826, "grad_norm": 1.155692219734192, "learning_rate": 1.986365184363335e-05, "loss": 1.0863, "step": 570 }, { "epoch": 0.23752697397498895, "grad_norm": 1.0995519161224365, "learning_rate": 1.9862913523271715e-05, "loss": 1.1298, "step": 571 }, { "epoch": 0.23794295816758962, "grad_norm": 0.9765012264251709, "learning_rate": 1.9862173223110864e-05, "loss": 1.0546, "step": 572 }, { "epoch": 0.2383589423601903, "grad_norm": 2.490360975265503, "learning_rate": 1.9861430943299395e-05, "loss": 1.104, "step": 573 }, { "epoch": 0.238774926552791, "grad_norm": 1.0332121849060059, "learning_rate": 1.9860686683986304e-05, "loss": 0.9943, "step": 574 }, { "epoch": 0.23919091074539167, "grad_norm": 0.9736289381980896, "learning_rate": 1.9859940445320994e-05, "loss": 0.8871, "step": 575 }, { "epoch": 0.23960689493799237, "grad_norm": 1.1042436361312866, "learning_rate": 1.9859192227453255e-05, "loss": 1.0674, "step": 576 }, { "epoch": 0.24002287913059303, "grad_norm": 1.073133111000061, "learning_rate": 1.985844203053328e-05, "loss": 1.0653, "step": 577 }, { "epoch": 0.24043886332319372, "grad_norm": 1.0623294115066528, "learning_rate": 1.9857689854711655e-05, "loss": 1.0872, "step": 578 }, { "epoch": 0.2408548475157944, "grad_norm": 1.228774905204773, "learning_rate": 1.9856935700139367e-05, "loss": 1.0623, "step": 579 }, { "epoch": 0.24127083170839508, "grad_norm": 0.9538869857788086, "learning_rate": 1.9856179566967802e-05, "loss": 0.9286, "step": 580 }, { "epoch": 0.24168681590099575, "grad_norm": 265.9437255859375, "learning_rate": 1.985542145534873e-05, "loss": 0.977, "step": 581 }, { "epoch": 0.24210280009359644, "grad_norm": 1.1220144033432007, "learning_rate": 1.985466136543434e-05, "loss": 1.1248, "step": 582 }, { "epoch": 0.24251878428619714, "grad_norm": 1.0860921144485474, "learning_rate": 1.9853899297377198e-05, "loss": 1.0522, "step": 583 }, { "epoch": 0.2429347684787978, "grad_norm": 1.1188560724258423, "learning_rate": 1.9853135251330278e-05, "loss": 1.0155, "step": 584 }, { "epoch": 0.2433507526713985, "grad_norm": 1.0845099687576294, "learning_rate": 1.985236922744695e-05, "loss": 1.0656, "step": 585 }, { "epoch": 0.24376673686399916, "grad_norm": 1.0723683834075928, "learning_rate": 1.9851601225880974e-05, "loss": 0.9942, "step": 586 }, { "epoch": 0.24418272105659986, "grad_norm": 0.9834114909172058, "learning_rate": 1.9850831246786516e-05, "loss": 1.1009, "step": 587 }, { "epoch": 0.24459870524920052, "grad_norm": 1.0134971141815186, "learning_rate": 1.9850059290318137e-05, "loss": 1.0804, "step": 588 }, { "epoch": 0.24501468944180121, "grad_norm": 0.988772988319397, "learning_rate": 1.984928535663079e-05, "loss": 1.0502, "step": 589 }, { "epoch": 0.24543067363440188, "grad_norm": 1.0213228464126587, "learning_rate": 1.984850944587983e-05, "loss": 0.971, "step": 590 }, { "epoch": 0.24584665782700257, "grad_norm": 1.032547950744629, "learning_rate": 1.9847731558221005e-05, "loss": 1.0557, "step": 591 }, { "epoch": 0.24626264201960327, "grad_norm": 1.0025687217712402, "learning_rate": 1.9846951693810458e-05, "loss": 1.1078, "step": 592 }, { "epoch": 0.24667862621220393, "grad_norm": 1.0726220607757568, "learning_rate": 1.9846169852804743e-05, "loss": 0.8936, "step": 593 }, { "epoch": 0.24709461040480463, "grad_norm": 1.0483043193817139, "learning_rate": 1.9845386035360793e-05, "loss": 0.9905, "step": 594 }, { "epoch": 0.2475105945974053, "grad_norm": 1.0846859216690063, "learning_rate": 1.984460024163594e-05, "loss": 1.0778, "step": 595 }, { "epoch": 0.24792657879000599, "grad_norm": 0.9614548683166504, "learning_rate": 1.984381247178793e-05, "loss": 0.8871, "step": 596 }, { "epoch": 0.24834256298260665, "grad_norm": 1057.4163818359375, "learning_rate": 1.9843022725974886e-05, "loss": 1.0161, "step": 597 }, { "epoch": 0.24875854717520735, "grad_norm": 1.2261807918548584, "learning_rate": 1.9842231004355338e-05, "loss": 1.1757, "step": 598 }, { "epoch": 0.249174531367808, "grad_norm": 1.0463498830795288, "learning_rate": 1.9841437307088208e-05, "loss": 0.9824, "step": 599 }, { "epoch": 0.2495905155604087, "grad_norm": 1.117531418800354, "learning_rate": 1.9840641634332813e-05, "loss": 0.9907, "step": 600 }, { "epoch": 0.25000649975300937, "grad_norm": 1.1001218557357788, "learning_rate": 1.9839843986248874e-05, "loss": 1.026, "step": 601 }, { "epoch": 0.25042248394561006, "grad_norm": 1.1421513557434082, "learning_rate": 1.9839044362996503e-05, "loss": 1.048, "step": 602 }, { "epoch": 0.25083846813821076, "grad_norm": 1.0794428586959839, "learning_rate": 1.983824276473621e-05, "loss": 1.0307, "step": 603 }, { "epoch": 0.25125445233081145, "grad_norm": 1.0490968227386475, "learning_rate": 1.98374391916289e-05, "loss": 0.9129, "step": 604 }, { "epoch": 0.2516704365234121, "grad_norm": 1.0812203884124756, "learning_rate": 1.9836633643835875e-05, "loss": 1.0254, "step": 605 }, { "epoch": 0.2520864207160128, "grad_norm": 15.036090850830078, "learning_rate": 1.9835826121518838e-05, "loss": 1.0376, "step": 606 }, { "epoch": 0.2525024049086135, "grad_norm": 1.1588501930236816, "learning_rate": 1.9835016624839884e-05, "loss": 1.0004, "step": 607 }, { "epoch": 0.25291838910121417, "grad_norm": 1.00113046169281, "learning_rate": 1.9834205153961497e-05, "loss": 0.9237, "step": 608 }, { "epoch": 0.2533343732938148, "grad_norm": 1.2047077417373657, "learning_rate": 1.9833391709046575e-05, "loss": 0.9622, "step": 609 }, { "epoch": 0.2537503574864155, "grad_norm": 1.0711098909378052, "learning_rate": 1.9832576290258396e-05, "loss": 1.0758, "step": 610 }, { "epoch": 0.2541663416790162, "grad_norm": 1.088252305984497, "learning_rate": 1.983175889776064e-05, "loss": 1.1907, "step": 611 }, { "epoch": 0.2545823258716169, "grad_norm": 1.0628526210784912, "learning_rate": 1.9830939531717387e-05, "loss": 1.005, "step": 612 }, { "epoch": 0.2549983100642176, "grad_norm": 1.052303671836853, "learning_rate": 1.9830118192293106e-05, "loss": 0.9838, "step": 613 }, { "epoch": 0.2554142942568182, "grad_norm": 1.113544225692749, "learning_rate": 1.982929487965267e-05, "loss": 1.1217, "step": 614 }, { "epoch": 0.2558302784494189, "grad_norm": 1.0441328287124634, "learning_rate": 1.9828469593961343e-05, "loss": 0.9415, "step": 615 }, { "epoch": 0.2562462626420196, "grad_norm": 1.029868483543396, "learning_rate": 1.9827642335384784e-05, "loss": 0.9084, "step": 616 }, { "epoch": 0.2566622468346203, "grad_norm": 1.133396029472351, "learning_rate": 1.982681310408905e-05, "loss": 1.0339, "step": 617 }, { "epoch": 0.25707823102722094, "grad_norm": 0.9724785089492798, "learning_rate": 1.9825981900240597e-05, "loss": 0.9791, "step": 618 }, { "epoch": 0.25749421521982163, "grad_norm": 1.1452714204788208, "learning_rate": 1.982514872400627e-05, "loss": 1.0692, "step": 619 }, { "epoch": 0.2579101994124223, "grad_norm": 1.0590906143188477, "learning_rate": 1.9824313575553316e-05, "loss": 1.0493, "step": 620 }, { "epoch": 0.258326183605023, "grad_norm": 1.0763423442840576, "learning_rate": 1.9823476455049374e-05, "loss": 1.0018, "step": 621 }, { "epoch": 0.2587421677976237, "grad_norm": 1.11001455783844, "learning_rate": 1.9822637362662486e-05, "loss": 0.9831, "step": 622 }, { "epoch": 0.25915815199022435, "grad_norm": 1.0652194023132324, "learning_rate": 1.982179629856108e-05, "loss": 0.9962, "step": 623 }, { "epoch": 0.25957413618282504, "grad_norm": 1.134026288986206, "learning_rate": 1.982095326291398e-05, "loss": 1.0323, "step": 624 }, { "epoch": 0.25999012037542574, "grad_norm": 1.0430129766464233, "learning_rate": 1.9820108255890417e-05, "loss": 0.8792, "step": 625 }, { "epoch": 0.26040610456802643, "grad_norm": 1.0993627309799194, "learning_rate": 1.9819261277660007e-05, "loss": 0.9122, "step": 626 }, { "epoch": 0.26082208876062707, "grad_norm": 1.1403800249099731, "learning_rate": 1.9818412328392772e-05, "loss": 1.0309, "step": 627 }, { "epoch": 0.26123807295322776, "grad_norm": 1.2236120700836182, "learning_rate": 1.9817561408259115e-05, "loss": 1.0347, "step": 628 }, { "epoch": 0.26165405714582846, "grad_norm": 1.1536449193954468, "learning_rate": 1.981670851742984e-05, "loss": 1.0831, "step": 629 }, { "epoch": 0.26207004133842915, "grad_norm": 1.137566328048706, "learning_rate": 1.9815853656076162e-05, "loss": 1.0223, "step": 630 }, { "epoch": 0.26248602553102984, "grad_norm": 1.0239369869232178, "learning_rate": 1.981499682436967e-05, "loss": 0.8359, "step": 631 }, { "epoch": 0.2629020097236305, "grad_norm": 1.1944313049316406, "learning_rate": 1.9814138022482353e-05, "loss": 1.062, "step": 632 }, { "epoch": 0.2633179939162312, "grad_norm": 1.107096791267395, "learning_rate": 1.9813277250586613e-05, "loss": 1.0443, "step": 633 }, { "epoch": 0.26373397810883187, "grad_norm": 1.0628607273101807, "learning_rate": 1.981241450885522e-05, "loss": 0.8634, "step": 634 }, { "epoch": 0.26414996230143256, "grad_norm": 1.0118048191070557, "learning_rate": 1.9811549797461366e-05, "loss": 1.0019, "step": 635 }, { "epoch": 0.2645659464940332, "grad_norm": 1.0581936836242676, "learning_rate": 1.981068311657862e-05, "loss": 1.0319, "step": 636 }, { "epoch": 0.2649819306866339, "grad_norm": 1.2589290142059326, "learning_rate": 1.9809814466380945e-05, "loss": 1.0268, "step": 637 }, { "epoch": 0.2653979148792346, "grad_norm": 1.0948143005371094, "learning_rate": 1.9808943847042722e-05, "loss": 0.9675, "step": 638 }, { "epoch": 0.2658138990718353, "grad_norm": 1.0595015287399292, "learning_rate": 1.98080712587387e-05, "loss": 1.0098, "step": 639 }, { "epoch": 0.266229883264436, "grad_norm": 45.35832977294922, "learning_rate": 1.980719670164404e-05, "loss": 1.0051, "step": 640 }, { "epoch": 0.2666458674570366, "grad_norm": 1.1236295700073242, "learning_rate": 1.980632017593429e-05, "loss": 1.0967, "step": 641 }, { "epoch": 0.2670618516496373, "grad_norm": 1.153807282447815, "learning_rate": 1.9805441681785402e-05, "loss": 1.1104, "step": 642 }, { "epoch": 0.267477835842238, "grad_norm": 1.057943344116211, "learning_rate": 1.9804561219373714e-05, "loss": 1.1229, "step": 643 }, { "epoch": 0.2678938200348387, "grad_norm": 77.7723388671875, "learning_rate": 1.9803678788875965e-05, "loss": 0.9337, "step": 644 }, { "epoch": 0.26830980422743933, "grad_norm": 1.1517904996871948, "learning_rate": 1.980279439046928e-05, "loss": 0.9861, "step": 645 }, { "epoch": 0.26872578842004, "grad_norm": 1.1755566596984863, "learning_rate": 1.9801908024331194e-05, "loss": 0.9343, "step": 646 }, { "epoch": 0.2691417726126407, "grad_norm": 1.1632273197174072, "learning_rate": 1.9801019690639624e-05, "loss": 1.0157, "step": 647 }, { "epoch": 0.2695577568052414, "grad_norm": 1.1167634725570679, "learning_rate": 1.980012938957289e-05, "loss": 0.9867, "step": 648 }, { "epoch": 0.2699737409978421, "grad_norm": 1.1246815919876099, "learning_rate": 1.9799237121309697e-05, "loss": 1.0042, "step": 649 }, { "epoch": 0.27038972519044274, "grad_norm": 1.224506139755249, "learning_rate": 1.9798342886029163e-05, "loss": 1.0029, "step": 650 }, { "epoch": 0.27080570938304344, "grad_norm": 1.1478439569473267, "learning_rate": 1.979744668391078e-05, "loss": 1.1004, "step": 651 }, { "epoch": 0.27122169357564413, "grad_norm": 1.2134697437286377, "learning_rate": 1.9796548515134444e-05, "loss": 0.9793, "step": 652 }, { "epoch": 0.2716376777682448, "grad_norm": 1.1205298900604248, "learning_rate": 1.9795648379880452e-05, "loss": 0.9382, "step": 653 }, { "epoch": 0.27205366196084546, "grad_norm": 1.1958024501800537, "learning_rate": 1.9794746278329485e-05, "loss": 1.0874, "step": 654 }, { "epoch": 0.27246964615344615, "grad_norm": 1.1456528902053833, "learning_rate": 1.9793842210662625e-05, "loss": 1.0747, "step": 655 }, { "epoch": 0.27288563034604685, "grad_norm": 1.1108404397964478, "learning_rate": 1.9792936177061348e-05, "loss": 1.0269, "step": 656 }, { "epoch": 0.27330161453864754, "grad_norm": 1.08179771900177, "learning_rate": 1.9792028177707522e-05, "loss": 0.9464, "step": 657 }, { "epoch": 0.27371759873124824, "grad_norm": 0.9799919128417969, "learning_rate": 1.979111821278341e-05, "loss": 0.8739, "step": 658 }, { "epoch": 0.2741335829238489, "grad_norm": 1.0520695447921753, "learning_rate": 1.9790206282471677e-05, "loss": 1.1277, "step": 659 }, { "epoch": 0.27454956711644957, "grad_norm": 1.2417911291122437, "learning_rate": 1.9789292386955366e-05, "loss": 1.0044, "step": 660 }, { "epoch": 0.27496555130905026, "grad_norm": 1.0409661531448364, "learning_rate": 1.9788376526417932e-05, "loss": 0.9551, "step": 661 }, { "epoch": 0.27538153550165095, "grad_norm": 1.1316101551055908, "learning_rate": 1.9787458701043215e-05, "loss": 0.9684, "step": 662 }, { "epoch": 0.2757975196942516, "grad_norm": 1.1982522010803223, "learning_rate": 1.9786538911015456e-05, "loss": 0.9177, "step": 663 }, { "epoch": 0.2762135038868523, "grad_norm": 1.1016120910644531, "learning_rate": 1.978561715651928e-05, "loss": 1.0915, "step": 664 }, { "epoch": 0.276629488079453, "grad_norm": 1.1569855213165283, "learning_rate": 1.9784693437739717e-05, "loss": 1.0096, "step": 665 }, { "epoch": 0.27704547227205367, "grad_norm": 1.2141473293304443, "learning_rate": 1.978376775486218e-05, "loss": 1.0071, "step": 666 }, { "epoch": 0.27746145646465437, "grad_norm": 1.075738787651062, "learning_rate": 1.9782840108072492e-05, "loss": 1.007, "step": 667 }, { "epoch": 0.277877440657255, "grad_norm": 1.0846142768859863, "learning_rate": 1.9781910497556852e-05, "loss": 0.9146, "step": 668 }, { "epoch": 0.2782934248498557, "grad_norm": 1.1614549160003662, "learning_rate": 1.978097892350187e-05, "loss": 1.0571, "step": 669 }, { "epoch": 0.2787094090424564, "grad_norm": 1.2333179712295532, "learning_rate": 1.978004538609454e-05, "loss": 1.118, "step": 670 }, { "epoch": 0.2791253932350571, "grad_norm": 1.0789361000061035, "learning_rate": 1.977910988552225e-05, "loss": 0.9503, "step": 671 }, { "epoch": 0.2795413774276577, "grad_norm": 1.1400907039642334, "learning_rate": 1.9778172421972783e-05, "loss": 1.0419, "step": 672 }, { "epoch": 0.2799573616202584, "grad_norm": 1.1590193510055542, "learning_rate": 1.9777232995634326e-05, "loss": 0.9668, "step": 673 }, { "epoch": 0.2803733458128591, "grad_norm": 1.2736048698425293, "learning_rate": 1.9776291606695444e-05, "loss": 1.0276, "step": 674 }, { "epoch": 0.2807893300054598, "grad_norm": 1.1979082822799683, "learning_rate": 1.9775348255345107e-05, "loss": 1.0658, "step": 675 }, { "epoch": 0.2812053141980605, "grad_norm": 1.1830520629882812, "learning_rate": 1.977440294177267e-05, "loss": 0.9964, "step": 676 }, { "epoch": 0.28162129839066113, "grad_norm": 1.1157934665679932, "learning_rate": 1.97734556661679e-05, "loss": 0.9405, "step": 677 }, { "epoch": 0.28203728258326183, "grad_norm": 1.1800243854522705, "learning_rate": 1.9772506428720933e-05, "loss": 1.0724, "step": 678 }, { "epoch": 0.2824532667758625, "grad_norm": 1.2236616611480713, "learning_rate": 1.9771555229622313e-05, "loss": 1.0755, "step": 679 }, { "epoch": 0.2828692509684632, "grad_norm": 1.221869945526123, "learning_rate": 1.977060206906298e-05, "loss": 1.1089, "step": 680 }, { "epoch": 0.28328523516106385, "grad_norm": 1.139997124671936, "learning_rate": 1.9769646947234262e-05, "loss": 1.0003, "step": 681 }, { "epoch": 0.28370121935366455, "grad_norm": 1.1112861633300781, "learning_rate": 1.9768689864327882e-05, "loss": 1.0587, "step": 682 }, { "epoch": 0.28411720354626524, "grad_norm": 1.170926570892334, "learning_rate": 1.9767730820535953e-05, "loss": 1.0169, "step": 683 }, { "epoch": 0.28453318773886593, "grad_norm": 1.1467214822769165, "learning_rate": 1.9766769816050993e-05, "loss": 1.1469, "step": 684 }, { "epoch": 0.2849491719314666, "grad_norm": 1.1167900562286377, "learning_rate": 1.97658068510659e-05, "loss": 1.0326, "step": 685 }, { "epoch": 0.28536515612406727, "grad_norm": 1.1864562034606934, "learning_rate": 1.9764841925773973e-05, "loss": 1.0276, "step": 686 }, { "epoch": 0.28578114031666796, "grad_norm": 1.115960717201233, "learning_rate": 1.9763875040368906e-05, "loss": 0.8831, "step": 687 }, { "epoch": 0.28619712450926865, "grad_norm": 1.0830838680267334, "learning_rate": 1.9762906195044782e-05, "loss": 1.0566, "step": 688 }, { "epoch": 0.28661310870186935, "grad_norm": 1.1530202627182007, "learning_rate": 1.9761935389996077e-05, "loss": 1.0265, "step": 689 }, { "epoch": 0.28702909289447, "grad_norm": 1.2053114175796509, "learning_rate": 1.9760962625417665e-05, "loss": 0.9666, "step": 690 }, { "epoch": 0.2874450770870707, "grad_norm": 1.2381350994110107, "learning_rate": 1.9759987901504804e-05, "loss": 1.0657, "step": 691 }, { "epoch": 0.28786106127967137, "grad_norm": 1.2298023700714111, "learning_rate": 1.975901121845316e-05, "loss": 0.9884, "step": 692 }, { "epoch": 0.28827704547227206, "grad_norm": 1.1304867267608643, "learning_rate": 1.9758032576458786e-05, "loss": 0.9236, "step": 693 }, { "epoch": 0.28869302966487276, "grad_norm": 1.1901757717132568, "learning_rate": 1.9757051975718116e-05, "loss": 1.0654, "step": 694 }, { "epoch": 0.2891090138574734, "grad_norm": 1.1868036985397339, "learning_rate": 1.9756069416427998e-05, "loss": 1.0053, "step": 695 }, { "epoch": 0.2895249980500741, "grad_norm": 1.20746648311615, "learning_rate": 1.9755084898785654e-05, "loss": 1.0325, "step": 696 }, { "epoch": 0.2899409822426748, "grad_norm": 1.0762369632720947, "learning_rate": 1.9754098422988713e-05, "loss": 0.8808, "step": 697 }, { "epoch": 0.2903569664352755, "grad_norm": 1.2416399717330933, "learning_rate": 1.9753109989235193e-05, "loss": 0.997, "step": 698 }, { "epoch": 0.2907729506278761, "grad_norm": 1.2089097499847412, "learning_rate": 1.97521195977235e-05, "loss": 1.0632, "step": 699 }, { "epoch": 0.2911889348204768, "grad_norm": 1.0795011520385742, "learning_rate": 1.9751127248652443e-05, "loss": 0.9281, "step": 700 }, { "epoch": 0.2916049190130775, "grad_norm": 1.2301104068756104, "learning_rate": 1.9750132942221212e-05, "loss": 1.1001, "step": 701 }, { "epoch": 0.2920209032056782, "grad_norm": 1.2068411111831665, "learning_rate": 1.9749136678629396e-05, "loss": 0.9438, "step": 702 }, { "epoch": 0.2924368873982789, "grad_norm": 1.165753722190857, "learning_rate": 1.974813845807698e-05, "loss": 0.8955, "step": 703 }, { "epoch": 0.2928528715908795, "grad_norm": 1.1278396844863892, "learning_rate": 1.9747138280764334e-05, "loss": 1.0736, "step": 704 }, { "epoch": 0.2932688557834802, "grad_norm": 1.0809954404830933, "learning_rate": 1.9746136146892232e-05, "loss": 1.0241, "step": 705 }, { "epoch": 0.2936848399760809, "grad_norm": 1.1630949974060059, "learning_rate": 1.9745132056661828e-05, "loss": 1.0641, "step": 706 }, { "epoch": 0.2941008241686816, "grad_norm": 1.1826828718185425, "learning_rate": 1.974412601027468e-05, "loss": 1.0642, "step": 707 }, { "epoch": 0.29451680836128225, "grad_norm": 1.273723840713501, "learning_rate": 1.9743118007932728e-05, "loss": 1.0657, "step": 708 }, { "epoch": 0.29493279255388294, "grad_norm": 1.0370484590530396, "learning_rate": 1.974210804983831e-05, "loss": 0.9432, "step": 709 }, { "epoch": 0.29534877674648363, "grad_norm": 1.4919400215148926, "learning_rate": 1.9741096136194162e-05, "loss": 0.9848, "step": 710 }, { "epoch": 0.2957647609390843, "grad_norm": 1.2141481637954712, "learning_rate": 1.9740082267203406e-05, "loss": 1.0447, "step": 711 }, { "epoch": 0.296180745131685, "grad_norm": 1.1195968389511108, "learning_rate": 1.973906644306955e-05, "loss": 1.0367, "step": 712 }, { "epoch": 0.29659672932428566, "grad_norm": 1.0475693941116333, "learning_rate": 1.973804866399651e-05, "loss": 0.8746, "step": 713 }, { "epoch": 0.29701271351688635, "grad_norm": 1.0946810245513916, "learning_rate": 1.9737028930188585e-05, "loss": 1.0296, "step": 714 }, { "epoch": 0.29742869770948704, "grad_norm": 1.1322277784347534, "learning_rate": 1.973600724185047e-05, "loss": 1.0263, "step": 715 }, { "epoch": 0.29784468190208774, "grad_norm": 5.330412864685059, "learning_rate": 1.9734983599187245e-05, "loss": 1.0153, "step": 716 }, { "epoch": 0.2982606660946884, "grad_norm": 7.498377323150635, "learning_rate": 1.9733958002404387e-05, "loss": 1.0327, "step": 717 }, { "epoch": 0.29867665028728907, "grad_norm": 1.2764872312545776, "learning_rate": 1.973293045170777e-05, "loss": 1.0319, "step": 718 }, { "epoch": 0.29909263447988976, "grad_norm": 1.2265570163726807, "learning_rate": 1.9731900947303657e-05, "loss": 1.0847, "step": 719 }, { "epoch": 0.29950861867249046, "grad_norm": 1.010620355606079, "learning_rate": 1.97308694893987e-05, "loss": 1.0066, "step": 720 }, { "epoch": 0.29992460286509115, "grad_norm": 1.136383295059204, "learning_rate": 1.9729836078199942e-05, "loss": 1.0788, "step": 721 }, { "epoch": 0.3003405870576918, "grad_norm": 1.251219391822815, "learning_rate": 1.9728800713914825e-05, "loss": 0.9662, "step": 722 }, { "epoch": 0.3007565712502925, "grad_norm": 1.0852371454238892, "learning_rate": 1.972776339675118e-05, "loss": 0.8996, "step": 723 }, { "epoch": 0.3011725554428932, "grad_norm": 1.1538898944854736, "learning_rate": 1.9726724126917226e-05, "loss": 1.0335, "step": 724 }, { "epoch": 0.30158853963549387, "grad_norm": 50.41621780395508, "learning_rate": 1.9725682904621583e-05, "loss": 1.0178, "step": 725 }, { "epoch": 0.3020045238280945, "grad_norm": 134.26663208007812, "learning_rate": 1.9724639730073253e-05, "loss": 1.0135, "step": 726 }, { "epoch": 0.3024205080206952, "grad_norm": 1.1845084428787231, "learning_rate": 1.9723594603481633e-05, "loss": 0.8785, "step": 727 }, { "epoch": 0.3028364922132959, "grad_norm": 1.151276707649231, "learning_rate": 1.9722547525056518e-05, "loss": 1.0002, "step": 728 }, { "epoch": 0.3032524764058966, "grad_norm": 1.237545371055603, "learning_rate": 1.9721498495008086e-05, "loss": 1.0269, "step": 729 }, { "epoch": 0.3036684605984973, "grad_norm": 1.182107925415039, "learning_rate": 1.972044751354691e-05, "loss": 1.0721, "step": 730 }, { "epoch": 0.3040844447910979, "grad_norm": 1.0820446014404297, "learning_rate": 1.9719394580883957e-05, "loss": 0.9518, "step": 731 }, { "epoch": 0.3045004289836986, "grad_norm": 1.1278892755508423, "learning_rate": 1.9718339697230587e-05, "loss": 0.9944, "step": 732 }, { "epoch": 0.3049164131762993, "grad_norm": 1.233481764793396, "learning_rate": 1.971728286279854e-05, "loss": 1.0718, "step": 733 }, { "epoch": 0.3053323973689, "grad_norm": 1.1551762819290161, "learning_rate": 1.9716224077799964e-05, "loss": 1.0575, "step": 734 }, { "epoch": 0.30574838156150064, "grad_norm": 1.1678874492645264, "learning_rate": 1.971516334244739e-05, "loss": 0.9193, "step": 735 }, { "epoch": 0.30616436575410133, "grad_norm": 1.296045184135437, "learning_rate": 1.9714100656953738e-05, "loss": 0.916, "step": 736 }, { "epoch": 0.306580349946702, "grad_norm": 1.2009780406951904, "learning_rate": 1.9713036021532324e-05, "loss": 0.8988, "step": 737 }, { "epoch": 0.3069963341393027, "grad_norm": 2.50976824760437, "learning_rate": 1.9711969436396852e-05, "loss": 0.9921, "step": 738 }, { "epoch": 0.3074123183319034, "grad_norm": 1.2645121812820435, "learning_rate": 1.9710900901761424e-05, "loss": 1.1038, "step": 739 }, { "epoch": 0.30782830252450405, "grad_norm": 1.1748161315917969, "learning_rate": 1.9709830417840524e-05, "loss": 1.077, "step": 740 }, { "epoch": 0.30824428671710474, "grad_norm": 1.1044466495513916, "learning_rate": 1.9708757984849035e-05, "loss": 1.0017, "step": 741 }, { "epoch": 0.30866027090970544, "grad_norm": 1.1365612745285034, "learning_rate": 1.970768360300223e-05, "loss": 0.8831, "step": 742 }, { "epoch": 0.30907625510230613, "grad_norm": 1.1993780136108398, "learning_rate": 1.970660727251577e-05, "loss": 0.9927, "step": 743 }, { "epoch": 0.30949223929490677, "grad_norm": 33808.921875, "learning_rate": 1.9705528993605703e-05, "loss": 0.9352, "step": 744 }, { "epoch": 0.30990822348750746, "grad_norm": 1.2001038789749146, "learning_rate": 1.970444876648848e-05, "loss": 0.9171, "step": 745 }, { "epoch": 0.31032420768010816, "grad_norm": 1.1888773441314697, "learning_rate": 1.9703366591380933e-05, "loss": 1.0155, "step": 746 }, { "epoch": 0.31074019187270885, "grad_norm": 1.301430583000183, "learning_rate": 1.9702282468500296e-05, "loss": 0.9454, "step": 747 }, { "epoch": 0.31115617606530954, "grad_norm": 1.351517915725708, "learning_rate": 1.9701196398064177e-05, "loss": 1.057, "step": 748 }, { "epoch": 0.3115721602579102, "grad_norm": 1.099715232849121, "learning_rate": 1.9700108380290593e-05, "loss": 0.899, "step": 749 }, { "epoch": 0.3119881444505109, "grad_norm": 1.2550445795059204, "learning_rate": 1.969901841539794e-05, "loss": 1.0835, "step": 750 }, { "epoch": 0.31240412864311157, "grad_norm": 1.160928726196289, "learning_rate": 1.969792650360501e-05, "loss": 0.9894, "step": 751 }, { "epoch": 0.31282011283571226, "grad_norm": 1.1881216764450073, "learning_rate": 1.9696832645130977e-05, "loss": 1.0644, "step": 752 }, { "epoch": 0.3132360970283129, "grad_norm": 1.2760207653045654, "learning_rate": 1.9695736840195427e-05, "loss": 1.0342, "step": 753 }, { "epoch": 0.3136520812209136, "grad_norm": 1.266791820526123, "learning_rate": 1.9694639089018307e-05, "loss": 1.043, "step": 754 }, { "epoch": 0.3140680654135143, "grad_norm": 1.0743842124938965, "learning_rate": 1.9693539391819983e-05, "loss": 0.9002, "step": 755 }, { "epoch": 0.314484049606115, "grad_norm": 1.1326977014541626, "learning_rate": 1.969243774882119e-05, "loss": 0.9584, "step": 756 }, { "epoch": 0.3149000337987157, "grad_norm": 1.195320725440979, "learning_rate": 1.969133416024307e-05, "loss": 1.0075, "step": 757 }, { "epoch": 0.3153160179913163, "grad_norm": 1.1387355327606201, "learning_rate": 1.969022862630715e-05, "loss": 1.0555, "step": 758 }, { "epoch": 0.315732002183917, "grad_norm": 30044.65625, "learning_rate": 1.9689121147235334e-05, "loss": 0.9869, "step": 759 }, { "epoch": 0.3161479863765177, "grad_norm": 1.1006819009780884, "learning_rate": 1.9688011723249936e-05, "loss": 0.9113, "step": 760 }, { "epoch": 0.3165639705691184, "grad_norm": 1.1766600608825684, "learning_rate": 1.9686900354573653e-05, "loss": 0.9209, "step": 761 }, { "epoch": 0.31697995476171903, "grad_norm": 1.229279637336731, "learning_rate": 1.968578704142957e-05, "loss": 0.969, "step": 762 }, { "epoch": 0.3173959389543197, "grad_norm": 1.2574024200439453, "learning_rate": 1.9684671784041166e-05, "loss": 0.9752, "step": 763 }, { "epoch": 0.3178119231469204, "grad_norm": 1.1648303270339966, "learning_rate": 1.9683554582632304e-05, "loss": 0.9443, "step": 764 }, { "epoch": 0.3182279073395211, "grad_norm": 1.0514328479766846, "learning_rate": 1.9682435437427244e-05, "loss": 0.8295, "step": 765 }, { "epoch": 0.3186438915321218, "grad_norm": 1.1310639381408691, "learning_rate": 1.968131434865064e-05, "loss": 0.942, "step": 766 }, { "epoch": 0.31905987572472244, "grad_norm": 1.1488736867904663, "learning_rate": 1.9680191316527522e-05, "loss": 0.9146, "step": 767 }, { "epoch": 0.31947585991732314, "grad_norm": 1.1792199611663818, "learning_rate": 1.967906634128332e-05, "loss": 0.9079, "step": 768 }, { "epoch": 0.31989184410992383, "grad_norm": 1.2693843841552734, "learning_rate": 1.9677939423143854e-05, "loss": 1.043, "step": 769 }, { "epoch": 0.3203078283025245, "grad_norm": 1.2381826639175415, "learning_rate": 1.9676810562335333e-05, "loss": 1.0175, "step": 770 }, { "epoch": 0.32072381249512516, "grad_norm": 1.1295031309127808, "learning_rate": 1.967567975908435e-05, "loss": 1.0822, "step": 771 }, { "epoch": 0.32113979668772585, "grad_norm": 1.127444863319397, "learning_rate": 1.96745470136179e-05, "loss": 1.0289, "step": 772 }, { "epoch": 0.32155578088032655, "grad_norm": 1.088637113571167, "learning_rate": 1.9673412326163355e-05, "loss": 0.919, "step": 773 }, { "epoch": 0.32197176507292724, "grad_norm": 1.1318120956420898, "learning_rate": 1.9672275696948487e-05, "loss": 1.008, "step": 774 }, { "epoch": 0.32238774926552793, "grad_norm": 1.2156147956848145, "learning_rate": 1.9671137126201448e-05, "loss": 0.8898, "step": 775 }, { "epoch": 0.32280373345812857, "grad_norm": 1.2303749322891235, "learning_rate": 1.9669996614150792e-05, "loss": 1.0249, "step": 776 }, { "epoch": 0.32321971765072927, "grad_norm": 1.2127894163131714, "learning_rate": 1.966885416102545e-05, "loss": 0.9186, "step": 777 }, { "epoch": 0.32363570184332996, "grad_norm": 1.211269497871399, "learning_rate": 1.966770976705475e-05, "loss": 1.0002, "step": 778 }, { "epoch": 0.32405168603593065, "grad_norm": 1.1904046535491943, "learning_rate": 1.9666563432468414e-05, "loss": 0.9648, "step": 779 }, { "epoch": 0.3244676702285313, "grad_norm": 1.0780136585235596, "learning_rate": 1.966541515749654e-05, "loss": 0.9563, "step": 780 }, { "epoch": 0.324883654421132, "grad_norm": 1.2947766780853271, "learning_rate": 1.9664264942369628e-05, "loss": 1.1529, "step": 781 }, { "epoch": 0.3252996386137327, "grad_norm": 1.1364264488220215, "learning_rate": 1.9663112787318562e-05, "loss": 1.0127, "step": 782 }, { "epoch": 0.32571562280633337, "grad_norm": 1.2422966957092285, "learning_rate": 1.9661958692574613e-05, "loss": 0.9906, "step": 783 }, { "epoch": 0.32613160699893406, "grad_norm": 1.1297751665115356, "learning_rate": 1.9660802658369445e-05, "loss": 0.8766, "step": 784 }, { "epoch": 0.3265475911915347, "grad_norm": 1.3017988204956055, "learning_rate": 1.965964468493511e-05, "loss": 1.0871, "step": 785 }, { "epoch": 0.3269635753841354, "grad_norm": 1.27329683303833, "learning_rate": 1.9658484772504058e-05, "loss": 0.9216, "step": 786 }, { "epoch": 0.3273795595767361, "grad_norm": 1.326149344444275, "learning_rate": 1.965732292130911e-05, "loss": 0.9158, "step": 787 }, { "epoch": 0.3277955437693368, "grad_norm": 1.2065016031265259, "learning_rate": 1.9656159131583493e-05, "loss": 1.0511, "step": 788 }, { "epoch": 0.3282115279619374, "grad_norm": 1.1948503255844116, "learning_rate": 1.965499340356081e-05, "loss": 1.0538, "step": 789 }, { "epoch": 0.3286275121545381, "grad_norm": 1.3039908409118652, "learning_rate": 1.9653825737475066e-05, "loss": 0.9507, "step": 790 }, { "epoch": 0.3290434963471388, "grad_norm": 1.2112514972686768, "learning_rate": 1.9652656133560645e-05, "loss": 0.9652, "step": 791 }, { "epoch": 0.3294594805397395, "grad_norm": 1.193145990371704, "learning_rate": 1.9651484592052323e-05, "loss": 0.8694, "step": 792 }, { "epoch": 0.3298754647323402, "grad_norm": 1.2356621026992798, "learning_rate": 1.9650311113185266e-05, "loss": 1.0426, "step": 793 }, { "epoch": 0.33029144892494083, "grad_norm": 1.0420007705688477, "learning_rate": 1.9649135697195035e-05, "loss": 1.0257, "step": 794 }, { "epoch": 0.3307074331175415, "grad_norm": 23.628820419311523, "learning_rate": 1.9647958344317562e-05, "loss": 1.0496, "step": 795 }, { "epoch": 0.3311234173101422, "grad_norm": 1.1461067199707031, "learning_rate": 1.964677905478919e-05, "loss": 0.8943, "step": 796 }, { "epoch": 0.3315394015027429, "grad_norm": 1.2321876287460327, "learning_rate": 1.964559782884663e-05, "loss": 1.0428, "step": 797 }, { "epoch": 0.33195538569534355, "grad_norm": 1.1652549505233765, "learning_rate": 1.9644414666726995e-05, "loss": 0.7901, "step": 798 }, { "epoch": 0.33237136988794425, "grad_norm": 1.7122852802276611, "learning_rate": 1.964322956866779e-05, "loss": 1.0509, "step": 799 }, { "epoch": 0.33278735408054494, "grad_norm": 1.1587001085281372, "learning_rate": 1.964204253490689e-05, "loss": 0.9645, "step": 800 }, { "epoch": 0.33320333827314563, "grad_norm": 1.1111230850219727, "learning_rate": 1.964085356568258e-05, "loss": 1.0516, "step": 801 }, { "epoch": 0.3336193224657463, "grad_norm": 1.1415914297103882, "learning_rate": 1.9639662661233516e-05, "loss": 0.9697, "step": 802 }, { "epoch": 0.33403530665834696, "grad_norm": 1.1062525510787964, "learning_rate": 1.963846982179876e-05, "loss": 0.9266, "step": 803 }, { "epoch": 0.33445129085094766, "grad_norm": 1.1116715669631958, "learning_rate": 1.963727504761774e-05, "loss": 0.9334, "step": 804 }, { "epoch": 0.33486727504354835, "grad_norm": 1.1290841102600098, "learning_rate": 1.9636078338930296e-05, "loss": 0.864, "step": 805 }, { "epoch": 0.33528325923614904, "grad_norm": 1.33307683467865, "learning_rate": 1.963487969597664e-05, "loss": 1.0081, "step": 806 }, { "epoch": 0.3356992434287497, "grad_norm": 1.2289434671401978, "learning_rate": 1.9633679118997376e-05, "loss": 0.9154, "step": 807 }, { "epoch": 0.3361152276213504, "grad_norm": 1.1235321760177612, "learning_rate": 1.9632476608233505e-05, "loss": 0.8238, "step": 808 }, { "epoch": 0.33653121181395107, "grad_norm": 1.2481715679168701, "learning_rate": 1.96312721639264e-05, "loss": 0.945, "step": 809 }, { "epoch": 0.33694719600655176, "grad_norm": 1.3090366125106812, "learning_rate": 1.963006578631784e-05, "loss": 1.1444, "step": 810 }, { "epoch": 0.33736318019915246, "grad_norm": 2735.837890625, "learning_rate": 1.9628857475649976e-05, "loss": 0.9015, "step": 811 }, { "epoch": 0.3377791643917531, "grad_norm": 1.284813404083252, "learning_rate": 1.9627647232165358e-05, "loss": 1.0045, "step": 812 }, { "epoch": 0.3381951485843538, "grad_norm": 1.1823112964630127, "learning_rate": 1.962643505610692e-05, "loss": 0.9789, "step": 813 }, { "epoch": 0.3386111327769545, "grad_norm": 1.3377506732940674, "learning_rate": 1.962522094771798e-05, "loss": 1.0505, "step": 814 }, { "epoch": 0.3390271169695552, "grad_norm": 1.2225475311279297, "learning_rate": 1.9624004907242256e-05, "loss": 0.9534, "step": 815 }, { "epoch": 0.3394431011621558, "grad_norm": 2.071073293685913, "learning_rate": 1.962278693492384e-05, "loss": 0.9682, "step": 816 }, { "epoch": 0.3398590853547565, "grad_norm": 1.2747551202774048, "learning_rate": 1.9621567031007218e-05, "loss": 0.9797, "step": 817 }, { "epoch": 0.3402750695473572, "grad_norm": 1.2903165817260742, "learning_rate": 1.9620345195737265e-05, "loss": 1.102, "step": 818 }, { "epoch": 0.3406910537399579, "grad_norm": 1.1185860633850098, "learning_rate": 1.961912142935924e-05, "loss": 0.9013, "step": 819 }, { "epoch": 0.3411070379325586, "grad_norm": 1.1925883293151855, "learning_rate": 1.9617895732118793e-05, "loss": 1.0135, "step": 820 }, { "epoch": 0.3415230221251592, "grad_norm": 1.2348824739456177, "learning_rate": 1.9616668104261964e-05, "loss": 1.0737, "step": 821 }, { "epoch": 0.3419390063177599, "grad_norm": 1.2074382305145264, "learning_rate": 1.9615438546035174e-05, "loss": 0.9902, "step": 822 }, { "epoch": 0.3423549905103606, "grad_norm": 1.3026390075683594, "learning_rate": 1.961420705768523e-05, "loss": 1.0553, "step": 823 }, { "epoch": 0.3427709747029613, "grad_norm": 1.2051104307174683, "learning_rate": 1.961297363945933e-05, "loss": 1.0219, "step": 824 }, { "epoch": 0.34318695889556194, "grad_norm": 1.198935627937317, "learning_rate": 1.9611738291605073e-05, "loss": 0.9737, "step": 825 }, { "epoch": 0.34360294308816264, "grad_norm": 1.2062697410583496, "learning_rate": 1.9610501014370416e-05, "loss": 0.976, "step": 826 }, { "epoch": 0.34401892728076333, "grad_norm": 1.2435572147369385, "learning_rate": 1.960926180800373e-05, "loss": 0.9593, "step": 827 }, { "epoch": 0.344434911473364, "grad_norm": 1.2692896127700806, "learning_rate": 1.9608020672753764e-05, "loss": 1.0744, "step": 828 }, { "epoch": 0.3448508956659647, "grad_norm": 1.1833693981170654, "learning_rate": 1.9606777608869648e-05, "loss": 0.9274, "step": 829 }, { "epoch": 0.34526687985856536, "grad_norm": 1.2637405395507812, "learning_rate": 1.9605532616600905e-05, "loss": 0.9354, "step": 830 }, { "epoch": 0.34568286405116605, "grad_norm": 1.175661325454712, "learning_rate": 1.9604285696197444e-05, "loss": 0.9563, "step": 831 }, { "epoch": 0.34609884824376674, "grad_norm": 1.1024503707885742, "learning_rate": 1.9603036847909562e-05, "loss": 0.8932, "step": 832 }, { "epoch": 0.34651483243636744, "grad_norm": 1.213556170463562, "learning_rate": 1.960178607198795e-05, "loss": 0.9968, "step": 833 }, { "epoch": 0.3469308166289681, "grad_norm": 1.2759294509887695, "learning_rate": 1.9600533368683664e-05, "loss": 1.0161, "step": 834 }, { "epoch": 0.34734680082156877, "grad_norm": 1.1928179264068604, "learning_rate": 1.959927873824817e-05, "loss": 0.9132, "step": 835 }, { "epoch": 0.34776278501416946, "grad_norm": 1.2492977380752563, "learning_rate": 1.9598022180933313e-05, "loss": 0.949, "step": 836 }, { "epoch": 0.34817876920677016, "grad_norm": 1.2197312116622925, "learning_rate": 1.959676369699132e-05, "loss": 0.935, "step": 837 }, { "epoch": 0.34859475339937085, "grad_norm": 1.128501057624817, "learning_rate": 1.9595503286674812e-05, "loss": 0.9545, "step": 838 }, { "epoch": 0.3490107375919715, "grad_norm": 1.3121086359024048, "learning_rate": 1.9594240950236792e-05, "loss": 1.0833, "step": 839 }, { "epoch": 0.3494267217845722, "grad_norm": 1.144476294517517, "learning_rate": 1.959297668793065e-05, "loss": 0.9307, "step": 840 }, { "epoch": 0.3498427059771729, "grad_norm": 1.2200947999954224, "learning_rate": 1.959171050001016e-05, "loss": 0.9482, "step": 841 }, { "epoch": 0.35025869016977357, "grad_norm": 1.279083013534546, "learning_rate": 1.9590442386729497e-05, "loss": 1.0603, "step": 842 }, { "epoch": 0.3506746743623742, "grad_norm": 1.1899069547653198, "learning_rate": 1.9589172348343196e-05, "loss": 1.059, "step": 843 }, { "epoch": 0.3510906585549749, "grad_norm": 1.259576678276062, "learning_rate": 1.9587900385106212e-05, "loss": 1.0495, "step": 844 }, { "epoch": 0.3515066427475756, "grad_norm": 1.3295639753341675, "learning_rate": 1.9586626497273854e-05, "loss": 1.041, "step": 845 }, { "epoch": 0.3519226269401763, "grad_norm": 1.1956877708435059, "learning_rate": 1.9585350685101834e-05, "loss": 0.9598, "step": 846 }, { "epoch": 0.352338611132777, "grad_norm": 1.2390236854553223, "learning_rate": 1.9584072948846254e-05, "loss": 1.0467, "step": 847 }, { "epoch": 0.3527545953253776, "grad_norm": 59.00429153442383, "learning_rate": 1.958279328876359e-05, "loss": 1.0265, "step": 848 }, { "epoch": 0.3531705795179783, "grad_norm": 1.128399133682251, "learning_rate": 1.9581511705110712e-05, "loss": 0.8821, "step": 849 }, { "epoch": 0.353586563710579, "grad_norm": 1.1197288036346436, "learning_rate": 1.958022819814488e-05, "loss": 0.8994, "step": 850 }, { "epoch": 0.3540025479031797, "grad_norm": 1.164614200592041, "learning_rate": 1.957894276812373e-05, "loss": 0.8956, "step": 851 }, { "epoch": 0.35441853209578034, "grad_norm": 1.2713230848312378, "learning_rate": 1.9577655415305283e-05, "loss": 1.0039, "step": 852 }, { "epoch": 0.35483451628838103, "grad_norm": 1.2868025302886963, "learning_rate": 1.957636613994796e-05, "loss": 1.1137, "step": 853 }, { "epoch": 0.3552505004809817, "grad_norm": 1.3136762380599976, "learning_rate": 1.9575074942310554e-05, "loss": 0.9198, "step": 854 }, { "epoch": 0.3556664846735824, "grad_norm": 1.1841577291488647, "learning_rate": 1.9573781822652253e-05, "loss": 0.9435, "step": 855 }, { "epoch": 0.3560824688661831, "grad_norm": 1.2193703651428223, "learning_rate": 1.9572486781232624e-05, "loss": 0.9958, "step": 856 }, { "epoch": 0.35649845305878375, "grad_norm": 1.2473218441009521, "learning_rate": 1.9571189818311625e-05, "loss": 1.0739, "step": 857 }, { "epoch": 0.35691443725138444, "grad_norm": 2.7579433917999268, "learning_rate": 1.95698909341496e-05, "loss": 1.0045, "step": 858 }, { "epoch": 0.35733042144398514, "grad_norm": 1.2771728038787842, "learning_rate": 1.9568590129007272e-05, "loss": 0.918, "step": 859 }, { "epoch": 0.35774640563658583, "grad_norm": 1.2141672372817993, "learning_rate": 1.9567287403145754e-05, "loss": 1.0474, "step": 860 }, { "epoch": 0.35816238982918647, "grad_norm": 1.3000558614730835, "learning_rate": 1.9565982756826543e-05, "loss": 1.0274, "step": 861 }, { "epoch": 0.35857837402178716, "grad_norm": 1.2166717052459717, "learning_rate": 1.9564676190311532e-05, "loss": 0.9552, "step": 862 }, { "epoch": 0.35899435821438785, "grad_norm": 1.1611398458480835, "learning_rate": 1.9563367703862976e-05, "loss": 0.8467, "step": 863 }, { "epoch": 0.35941034240698855, "grad_norm": 1.3717091083526611, "learning_rate": 1.9562057297743542e-05, "loss": 0.9997, "step": 864 }, { "epoch": 0.35982632659958924, "grad_norm": 1.30080246925354, "learning_rate": 1.9560744972216263e-05, "loss": 0.9269, "step": 865 }, { "epoch": 0.3602423107921899, "grad_norm": 1.1663786172866821, "learning_rate": 1.9559430727544567e-05, "loss": 0.8912, "step": 866 }, { "epoch": 0.3606582949847906, "grad_norm": 1.1646713018417358, "learning_rate": 1.9558114563992263e-05, "loss": 0.9715, "step": 867 }, { "epoch": 0.36107427917739127, "grad_norm": 1.1757903099060059, "learning_rate": 1.955679648182355e-05, "loss": 0.9755, "step": 868 }, { "epoch": 0.36149026336999196, "grad_norm": 1.2136168479919434, "learning_rate": 1.9555476481303003e-05, "loss": 1.0147, "step": 869 }, { "epoch": 0.3619062475625926, "grad_norm": 1.2217543125152588, "learning_rate": 1.9554154562695595e-05, "loss": 0.9196, "step": 870 }, { "epoch": 0.3623222317551933, "grad_norm": 1.2475576400756836, "learning_rate": 1.955283072626667e-05, "loss": 1.0208, "step": 871 }, { "epoch": 0.362738215947794, "grad_norm": 1.133034110069275, "learning_rate": 1.9551504972281973e-05, "loss": 0.86, "step": 872 }, { "epoch": 0.3631542001403947, "grad_norm": 1.2443410158157349, "learning_rate": 1.9550177301007617e-05, "loss": 0.9394, "step": 873 }, { "epoch": 0.36357018433299537, "grad_norm": 1.2620481252670288, "learning_rate": 1.9548847712710107e-05, "loss": 0.9498, "step": 874 }, { "epoch": 0.363986168525596, "grad_norm": 1.1980582475662231, "learning_rate": 1.954751620765634e-05, "loss": 0.8548, "step": 875 }, { "epoch": 0.3644021527181967, "grad_norm": 1.2170246839523315, "learning_rate": 1.9546182786113582e-05, "loss": 0.939, "step": 876 }, { "epoch": 0.3648181369107974, "grad_norm": 1.2926453351974487, "learning_rate": 1.9544847448349506e-05, "loss": 1.0121, "step": 877 }, { "epoch": 0.3652341211033981, "grad_norm": 1.2902415990829468, "learning_rate": 1.9543510194632143e-05, "loss": 1.0221, "step": 878 }, { "epoch": 0.36565010529599873, "grad_norm": 1.2296974658966064, "learning_rate": 1.954217102522993e-05, "loss": 1.0094, "step": 879 }, { "epoch": 0.3660660894885994, "grad_norm": 1.280307412147522, "learning_rate": 1.9540829940411683e-05, "loss": 0.9259, "step": 880 }, { "epoch": 0.3664820736812001, "grad_norm": 1.3276466131210327, "learning_rate": 1.9539486940446593e-05, "loss": 0.823, "step": 881 }, { "epoch": 0.3668980578738008, "grad_norm": 1.2468827962875366, "learning_rate": 1.9538142025604244e-05, "loss": 1.0632, "step": 882 }, { "epoch": 0.3673140420664015, "grad_norm": 1.2068740129470825, "learning_rate": 1.9536795196154608e-05, "loss": 0.9284, "step": 883 }, { "epoch": 0.36773002625900214, "grad_norm": 1.241272211074829, "learning_rate": 1.953544645236803e-05, "loss": 1.0052, "step": 884 }, { "epoch": 0.36814601045160283, "grad_norm": 1.2943323850631714, "learning_rate": 1.9534095794515254e-05, "loss": 0.9341, "step": 885 }, { "epoch": 0.3685619946442035, "grad_norm": 1.2111164331436157, "learning_rate": 1.9532743222867394e-05, "loss": 1.0168, "step": 886 }, { "epoch": 0.3689779788368042, "grad_norm": 1.3844037055969238, "learning_rate": 1.953138873769595e-05, "loss": 0.9958, "step": 887 }, { "epoch": 0.36939396302940486, "grad_norm": 1.2066082954406738, "learning_rate": 1.9530032339272818e-05, "loss": 0.8604, "step": 888 }, { "epoch": 0.36980994722200555, "grad_norm": 1.2032543420791626, "learning_rate": 1.9528674027870268e-05, "loss": 0.8834, "step": 889 }, { "epoch": 0.37022593141460625, "grad_norm": 1.2360087633132935, "learning_rate": 1.9527313803760955e-05, "loss": 0.9292, "step": 890 }, { "epoch": 0.37064191560720694, "grad_norm": 1.2032638788223267, "learning_rate": 1.9525951667217917e-05, "loss": 1.0557, "step": 891 }, { "epoch": 0.37105789979980763, "grad_norm": 1.368396520614624, "learning_rate": 1.9524587618514582e-05, "loss": 1.0121, "step": 892 }, { "epoch": 0.37147388399240827, "grad_norm": 1.2658140659332275, "learning_rate": 1.9523221657924755e-05, "loss": 0.9816, "step": 893 }, { "epoch": 0.37188986818500896, "grad_norm": 1.2096116542816162, "learning_rate": 1.9521853785722627e-05, "loss": 0.9861, "step": 894 }, { "epoch": 0.37230585237760966, "grad_norm": 1.2486974000930786, "learning_rate": 1.9520484002182777e-05, "loss": 0.9443, "step": 895 }, { "epoch": 0.37272183657021035, "grad_norm": 1.218212366104126, "learning_rate": 1.9519112307580164e-05, "loss": 0.8537, "step": 896 }, { "epoch": 0.373137820762811, "grad_norm": 1.237612247467041, "learning_rate": 1.9517738702190122e-05, "loss": 0.9417, "step": 897 }, { "epoch": 0.3735538049554117, "grad_norm": 1.3294402360916138, "learning_rate": 1.9516363186288387e-05, "loss": 1.0067, "step": 898 }, { "epoch": 0.3739697891480124, "grad_norm": 1.2722333669662476, "learning_rate": 1.9514985760151068e-05, "loss": 1.0067, "step": 899 }, { "epoch": 0.37438577334061307, "grad_norm": 1.1984546184539795, "learning_rate": 1.951360642405465e-05, "loss": 0.9556, "step": 900 }, { "epoch": 0.37480175753321376, "grad_norm": 1.2128154039382935, "learning_rate": 1.9512225178276015e-05, "loss": 1.0622, "step": 901 }, { "epoch": 0.3752177417258144, "grad_norm": 1.2769620418548584, "learning_rate": 1.9510842023092423e-05, "loss": 0.8233, "step": 902 }, { "epoch": 0.3756337259184151, "grad_norm": 1.2069389820098877, "learning_rate": 1.950945695878152e-05, "loss": 0.9753, "step": 903 }, { "epoch": 0.3760497101110158, "grad_norm": 1.1056239604949951, "learning_rate": 1.9508069985621323e-05, "loss": 0.9587, "step": 904 }, { "epoch": 0.3764656943036165, "grad_norm": 1.2392305135726929, "learning_rate": 1.950668110389025e-05, "loss": 0.8916, "step": 905 }, { "epoch": 0.3768816784962171, "grad_norm": 1.307966947555542, "learning_rate": 1.9505290313867093e-05, "loss": 0.9598, "step": 906 }, { "epoch": 0.3772976626888178, "grad_norm": 1.2371041774749756, "learning_rate": 1.950389761583102e-05, "loss": 1.0704, "step": 907 }, { "epoch": 0.3777136468814185, "grad_norm": 1.2304229736328125, "learning_rate": 1.95025030100616e-05, "loss": 0.9495, "step": 908 }, { "epoch": 0.3781296310740192, "grad_norm": 1.3219233751296997, "learning_rate": 1.9501106496838768e-05, "loss": 0.9957, "step": 909 }, { "epoch": 0.3785456152666199, "grad_norm": 1.2107397317886353, "learning_rate": 1.949970807644285e-05, "loss": 1.0254, "step": 910 }, { "epoch": 0.37896159945922053, "grad_norm": 1.3583800792694092, "learning_rate": 1.949830774915455e-05, "loss": 1.0259, "step": 911 }, { "epoch": 0.3793775836518212, "grad_norm": 1.228307843208313, "learning_rate": 1.9496905515254963e-05, "loss": 0.9683, "step": 912 }, { "epoch": 0.3797935678444219, "grad_norm": 1.2978886365890503, "learning_rate": 1.949550137502556e-05, "loss": 0.9276, "step": 913 }, { "epoch": 0.3802095520370226, "grad_norm": 1.2447502613067627, "learning_rate": 1.9494095328748198e-05, "loss": 0.8952, "step": 914 }, { "epoch": 0.38062553622962325, "grad_norm": 1.241921305656433, "learning_rate": 1.9492687376705115e-05, "loss": 0.9431, "step": 915 }, { "epoch": 0.38104152042222394, "grad_norm": 1.260749340057373, "learning_rate": 1.949127751917893e-05, "loss": 1.0562, "step": 916 }, { "epoch": 0.38145750461482464, "grad_norm": 1.2847061157226562, "learning_rate": 1.9489865756452642e-05, "loss": 0.896, "step": 917 }, { "epoch": 0.38187348880742533, "grad_norm": 1.2548872232437134, "learning_rate": 1.9488452088809643e-05, "loss": 0.9602, "step": 918 }, { "epoch": 0.382289473000026, "grad_norm": 1.2443233728408813, "learning_rate": 1.94870365165337e-05, "loss": 1.0927, "step": 919 }, { "epoch": 0.38270545719262666, "grad_norm": 1.239775538444519, "learning_rate": 1.9485619039908957e-05, "loss": 0.9342, "step": 920 }, { "epoch": 0.38312144138522736, "grad_norm": 1.175215244293213, "learning_rate": 1.9484199659219952e-05, "loss": 0.8632, "step": 921 }, { "epoch": 0.38353742557782805, "grad_norm": 1.2419265508651733, "learning_rate": 1.9482778374751607e-05, "loss": 0.9319, "step": 922 }, { "epoch": 0.38395340977042874, "grad_norm": 1.3561224937438965, "learning_rate": 1.9481355186789203e-05, "loss": 0.9751, "step": 923 }, { "epoch": 0.3843693939630294, "grad_norm": 1.437645435333252, "learning_rate": 1.9479930095618427e-05, "loss": 0.9984, "step": 924 }, { "epoch": 0.3847853781556301, "grad_norm": 1.3692928552627563, "learning_rate": 1.947850310152534e-05, "loss": 1.0297, "step": 925 }, { "epoch": 0.38520136234823077, "grad_norm": 1.2556082010269165, "learning_rate": 1.9477074204796387e-05, "loss": 1.0089, "step": 926 }, { "epoch": 0.38561734654083146, "grad_norm": 1.277631163597107, "learning_rate": 1.9475643405718388e-05, "loss": 0.9377, "step": 927 }, { "epoch": 0.38603333073343216, "grad_norm": 1.2691466808319092, "learning_rate": 1.9474210704578555e-05, "loss": 0.9581, "step": 928 }, { "epoch": 0.3864493149260328, "grad_norm": 1.1453957557678223, "learning_rate": 1.947277610166447e-05, "loss": 0.9499, "step": 929 }, { "epoch": 0.3868652991186335, "grad_norm": 1.3081674575805664, "learning_rate": 1.9471339597264108e-05, "loss": 0.9175, "step": 930 }, { "epoch": 0.3872812833112342, "grad_norm": 1.2189959287643433, "learning_rate": 1.946990119166582e-05, "loss": 1.0186, "step": 931 }, { "epoch": 0.3876972675038349, "grad_norm": 1.347998857498169, "learning_rate": 1.9468460885158343e-05, "loss": 0.9901, "step": 932 }, { "epoch": 0.3881132516964355, "grad_norm": 1.151504635810852, "learning_rate": 1.9467018678030783e-05, "loss": 0.9299, "step": 933 }, { "epoch": 0.3885292358890362, "grad_norm": 1.383777379989624, "learning_rate": 1.9465574570572646e-05, "loss": 0.9788, "step": 934 }, { "epoch": 0.3889452200816369, "grad_norm": 1.2582793235778809, "learning_rate": 1.9464128563073804e-05, "loss": 0.9799, "step": 935 }, { "epoch": 0.3893612042742376, "grad_norm": 1.1059362888336182, "learning_rate": 1.946268065582452e-05, "loss": 0.8687, "step": 936 }, { "epoch": 0.3897771884668383, "grad_norm": 1.1474480628967285, "learning_rate": 1.946123084911543e-05, "loss": 0.8894, "step": 937 }, { "epoch": 0.3901931726594389, "grad_norm": 1.2985451221466064, "learning_rate": 1.9459779143237566e-05, "loss": 0.907, "step": 938 }, { "epoch": 0.3906091568520396, "grad_norm": 1.327898383140564, "learning_rate": 1.945832553848232e-05, "loss": 0.9475, "step": 939 }, { "epoch": 0.3910251410446403, "grad_norm": 1.3083261251449585, "learning_rate": 1.945687003514148e-05, "loss": 1.041, "step": 940 }, { "epoch": 0.391441125237241, "grad_norm": 1.2869733572006226, "learning_rate": 1.9455412633507217e-05, "loss": 1.0173, "step": 941 }, { "epoch": 0.39185710942984164, "grad_norm": 1.358020544052124, "learning_rate": 1.945395333387208e-05, "loss": 1.0167, "step": 942 }, { "epoch": 0.39227309362244234, "grad_norm": 1.3464510440826416, "learning_rate": 1.945249213652898e-05, "loss": 1.0223, "step": 943 }, { "epoch": 0.39268907781504303, "grad_norm": 26.25218391418457, "learning_rate": 1.945102904177124e-05, "loss": 1.0442, "step": 944 }, { "epoch": 0.3931050620076437, "grad_norm": 1.4006154537200928, "learning_rate": 1.9449564049892547e-05, "loss": 0.9724, "step": 945 }, { "epoch": 0.3935210462002444, "grad_norm": 1.339440941810608, "learning_rate": 1.944809716118697e-05, "loss": 0.995, "step": 946 }, { "epoch": 0.39393703039284506, "grad_norm": 1.2373071908950806, "learning_rate": 1.9446628375948958e-05, "loss": 0.7825, "step": 947 }, { "epoch": 0.39435301458544575, "grad_norm": 1.354790210723877, "learning_rate": 1.944515769447335e-05, "loss": 1.0103, "step": 948 }, { "epoch": 0.39476899877804644, "grad_norm": 1.2114406824111938, "learning_rate": 1.944368511705535e-05, "loss": 1.0082, "step": 949 }, { "epoch": 0.39518498297064714, "grad_norm": 1.4390913248062134, "learning_rate": 1.944221064399055e-05, "loss": 0.9449, "step": 950 }, { "epoch": 0.3956009671632478, "grad_norm": 1.2783401012420654, "learning_rate": 1.944073427557493e-05, "loss": 0.8847, "step": 951 }, { "epoch": 0.39601695135584847, "grad_norm": 1.2864476442337036, "learning_rate": 1.9439256012104845e-05, "loss": 1.0225, "step": 952 }, { "epoch": 0.39643293554844916, "grad_norm": 1.1924494504928589, "learning_rate": 1.9437775853877025e-05, "loss": 0.9626, "step": 953 }, { "epoch": 0.39684891974104985, "grad_norm": 1.136006236076355, "learning_rate": 1.943629380118858e-05, "loss": 0.7959, "step": 954 }, { "epoch": 0.39726490393365055, "grad_norm": 1.2896759510040283, "learning_rate": 1.9434809854337014e-05, "loss": 0.9333, "step": 955 }, { "epoch": 0.3976808881262512, "grad_norm": 30.65985870361328, "learning_rate": 1.94333240136202e-05, "loss": 0.9385, "step": 956 }, { "epoch": 0.3980968723188519, "grad_norm": 74.17255401611328, "learning_rate": 1.943183627933639e-05, "loss": 1.0033, "step": 957 }, { "epoch": 0.3985128565114526, "grad_norm": 1.3728070259094238, "learning_rate": 1.943034665178422e-05, "loss": 1.0232, "step": 958 }, { "epoch": 0.39892884070405327, "grad_norm": 1.4561904668807983, "learning_rate": 1.942885513126271e-05, "loss": 0.9953, "step": 959 }, { "epoch": 0.3993448248966539, "grad_norm": 1.2768393754959106, "learning_rate": 1.942736171807125e-05, "loss": 0.9795, "step": 960 }, { "epoch": 0.3997608090892546, "grad_norm": 1.4257733821868896, "learning_rate": 1.9425866412509616e-05, "loss": 0.9787, "step": 961 }, { "epoch": 0.4001767932818553, "grad_norm": 1.3676525354385376, "learning_rate": 1.9424369214877966e-05, "loss": 0.9758, "step": 962 }, { "epoch": 0.400592777474456, "grad_norm": 1.2595210075378418, "learning_rate": 1.9422870125476833e-05, "loss": 0.9832, "step": 963 }, { "epoch": 0.4010087616670567, "grad_norm": 1.2375168800354004, "learning_rate": 1.9421369144607137e-05, "loss": 0.9283, "step": 964 }, { "epoch": 0.4014247458596573, "grad_norm": 1.3047516345977783, "learning_rate": 1.941986627257016e-05, "loss": 0.9029, "step": 965 }, { "epoch": 0.401840730052258, "grad_norm": 2.8805418014526367, "learning_rate": 1.941836150966759e-05, "loss": 0.9268, "step": 966 }, { "epoch": 0.4022567142448587, "grad_norm": 1.176392912864685, "learning_rate": 1.941685485620147e-05, "loss": 0.9652, "step": 967 }, { "epoch": 0.4026726984374594, "grad_norm": 1.2498983144760132, "learning_rate": 1.941534631247424e-05, "loss": 0.9745, "step": 968 }, { "epoch": 0.40308868263006004, "grad_norm": 1.191744089126587, "learning_rate": 1.941383587878871e-05, "loss": 1.0884, "step": 969 }, { "epoch": 0.40350466682266073, "grad_norm": 1.3340320587158203, "learning_rate": 1.9412323555448072e-05, "loss": 0.9637, "step": 970 }, { "epoch": 0.4039206510152614, "grad_norm": 1.3872419595718384, "learning_rate": 1.94108093427559e-05, "loss": 1.0249, "step": 971 }, { "epoch": 0.4043366352078621, "grad_norm": 1.3924168348312378, "learning_rate": 1.9409293241016135e-05, "loss": 1.0041, "step": 972 }, { "epoch": 0.4047526194004628, "grad_norm": 1.3093663454055786, "learning_rate": 1.9407775250533117e-05, "loss": 1.0021, "step": 973 }, { "epoch": 0.40516860359306345, "grad_norm": 1.3195446729660034, "learning_rate": 1.940625537161155e-05, "loss": 0.852, "step": 974 }, { "epoch": 0.40558458778566414, "grad_norm": 1.3339755535125732, "learning_rate": 1.9404733604556526e-05, "loss": 1.0879, "step": 975 }, { "epoch": 0.40600057197826483, "grad_norm": 1.2914485931396484, "learning_rate": 1.9403209949673502e-05, "loss": 0.9411, "step": 976 }, { "epoch": 0.40641655617086553, "grad_norm": 1.2140517234802246, "learning_rate": 1.9401684407268336e-05, "loss": 0.9782, "step": 977 }, { "epoch": 0.40683254036346617, "grad_norm": 1.2487874031066895, "learning_rate": 1.9400156977647248e-05, "loss": 0.9372, "step": 978 }, { "epoch": 0.40724852455606686, "grad_norm": 1.2295305728912354, "learning_rate": 1.9398627661116837e-05, "loss": 1.0133, "step": 979 }, { "epoch": 0.40766450874866755, "grad_norm": 1.4016673564910889, "learning_rate": 1.9397096457984087e-05, "loss": 0.9507, "step": 980 }, { "epoch": 0.40808049294126825, "grad_norm": 1.1339938640594482, "learning_rate": 1.9395563368556365e-05, "loss": 0.8656, "step": 981 }, { "epoch": 0.40849647713386894, "grad_norm": 1.2804237604141235, "learning_rate": 1.9394028393141406e-05, "loss": 0.9796, "step": 982 }, { "epoch": 0.4089124613264696, "grad_norm": 1.1857398748397827, "learning_rate": 1.9392491532047327e-05, "loss": 0.961, "step": 983 }, { "epoch": 0.40932844551907027, "grad_norm": 1.255814790725708, "learning_rate": 1.939095278558263e-05, "loss": 0.8939, "step": 984 }, { "epoch": 0.40974442971167097, "grad_norm": 1.2279170751571655, "learning_rate": 1.9389412154056188e-05, "loss": 0.8855, "step": 985 }, { "epoch": 0.41016041390427166, "grad_norm": 1.3452367782592773, "learning_rate": 1.938786963777725e-05, "loss": 0.8902, "step": 986 }, { "epoch": 0.4105763980968723, "grad_norm": 1.2783154249191284, "learning_rate": 1.9386325237055448e-05, "loss": 1.0011, "step": 987 }, { "epoch": 0.410992382289473, "grad_norm": 1.315933108329773, "learning_rate": 1.93847789522008e-05, "loss": 0.886, "step": 988 }, { "epoch": 0.4114083664820737, "grad_norm": 1.2882351875305176, "learning_rate": 1.9383230783523687e-05, "loss": 0.8759, "step": 989 }, { "epoch": 0.4118243506746744, "grad_norm": 1.395713210105896, "learning_rate": 1.938168073133488e-05, "loss": 0.9771, "step": 990 }, { "epoch": 0.41224033486727507, "grad_norm": 1.6765552759170532, "learning_rate": 1.938012879594552e-05, "loss": 1.0085, "step": 991 }, { "epoch": 0.4126563190598757, "grad_norm": 1.3468044996261597, "learning_rate": 1.9378574977667132e-05, "loss": 0.9088, "step": 992 }, { "epoch": 0.4130723032524764, "grad_norm": 1.3498116731643677, "learning_rate": 1.9377019276811614e-05, "loss": 0.9782, "step": 993 }, { "epoch": 0.4134882874450771, "grad_norm": 1.3857941627502441, "learning_rate": 1.937546169369125e-05, "loss": 1.0373, "step": 994 }, { "epoch": 0.4139042716376778, "grad_norm": 1.3384225368499756, "learning_rate": 1.937390222861869e-05, "loss": 0.901, "step": 995 }, { "epoch": 0.4143202558302784, "grad_norm": 1.275437831878662, "learning_rate": 1.937234088190697e-05, "loss": 0.9697, "step": 996 }, { "epoch": 0.4147362400228791, "grad_norm": 1.2823313474655151, "learning_rate": 1.9370777653869508e-05, "loss": 0.956, "step": 997 }, { "epoch": 0.4151522242154798, "grad_norm": 1.3041821718215942, "learning_rate": 1.9369212544820083e-05, "loss": 0.9168, "step": 998 }, { "epoch": 0.4155682084080805, "grad_norm": 1.259955644607544, "learning_rate": 1.9367645555072868e-05, "loss": 0.9338, "step": 999 }, { "epoch": 0.4159841926006812, "grad_norm": 1.4265719652175903, "learning_rate": 1.9366076684942407e-05, "loss": 0.9947, "step": 1000 }, { "epoch": 0.4159841926006812, "eval_loss": 0.876042366027832, "eval_runtime": 1663.1312, "eval_samples_per_second": 3.963, "eval_steps_per_second": 1.982, "step": 1000 }, { "epoch": 0.41640017679328184, "grad_norm": 690.1052856445312, "learning_rate": 1.9364505934743617e-05, "loss": 0.9989, "step": 1001 }, { "epoch": 0.41681616098588253, "grad_norm": 1.321067452430725, "learning_rate": 1.9362933304791807e-05, "loss": 0.8905, "step": 1002 }, { "epoch": 0.4172321451784832, "grad_norm": 1.1844286918640137, "learning_rate": 1.9361358795402646e-05, "loss": 0.7891, "step": 1003 }, { "epoch": 0.4176481293710839, "grad_norm": 1.3209913969039917, "learning_rate": 1.9359782406892186e-05, "loss": 0.872, "step": 1004 }, { "epoch": 0.41806411356368456, "grad_norm": 1.466092586517334, "learning_rate": 1.9358204139576865e-05, "loss": 1.0354, "step": 1005 }, { "epoch": 0.41848009775628525, "grad_norm": 1.2207581996917725, "learning_rate": 1.935662399377349e-05, "loss": 0.9338, "step": 1006 }, { "epoch": 0.41889608194888595, "grad_norm": 1.4143112897872925, "learning_rate": 1.9355041969799245e-05, "loss": 1.0107, "step": 1007 }, { "epoch": 0.41931206614148664, "grad_norm": 1.344908595085144, "learning_rate": 1.9353458067971692e-05, "loss": 1.0181, "step": 1008 }, { "epoch": 0.4197280503340873, "grad_norm": 1.407238245010376, "learning_rate": 1.935187228860877e-05, "loss": 0.9844, "step": 1009 }, { "epoch": 0.42014403452668797, "grad_norm": 1.2864211797714233, "learning_rate": 1.9350284632028795e-05, "loss": 0.7651, "step": 1010 }, { "epoch": 0.42056001871928866, "grad_norm": 1.200555682182312, "learning_rate": 1.934869509855046e-05, "loss": 0.7687, "step": 1011 }, { "epoch": 0.42097600291188936, "grad_norm": 1.3754500150680542, "learning_rate": 1.9347103688492836e-05, "loss": 0.9941, "step": 1012 }, { "epoch": 0.42139198710449005, "grad_norm": 1.3063589334487915, "learning_rate": 1.934551040217537e-05, "loss": 1.0607, "step": 1013 }, { "epoch": 0.4218079712970907, "grad_norm": 1.2835016250610352, "learning_rate": 1.934391523991788e-05, "loss": 0.9228, "step": 1014 }, { "epoch": 0.4222239554896914, "grad_norm": 1.3482011556625366, "learning_rate": 1.9342318202040572e-05, "loss": 1.0926, "step": 1015 }, { "epoch": 0.4226399396822921, "grad_norm": 1.371829867362976, "learning_rate": 1.934071928886402e-05, "loss": 0.9507, "step": 1016 }, { "epoch": 0.42305592387489277, "grad_norm": 1.2864675521850586, "learning_rate": 1.9339118500709176e-05, "loss": 0.9239, "step": 1017 }, { "epoch": 0.4234719080674934, "grad_norm": 1.2553811073303223, "learning_rate": 1.933751583789737e-05, "loss": 1.0477, "step": 1018 }, { "epoch": 0.4238878922600941, "grad_norm": 1.4295814037322998, "learning_rate": 1.9335911300750302e-05, "loss": 0.9856, "step": 1019 }, { "epoch": 0.4243038764526948, "grad_norm": 1.2618496417999268, "learning_rate": 1.9334304889590058e-05, "loss": 0.9049, "step": 1020 }, { "epoch": 0.4247198606452955, "grad_norm": 1.2319276332855225, "learning_rate": 1.9332696604739096e-05, "loss": 1.0285, "step": 1021 }, { "epoch": 0.4251358448378962, "grad_norm": 1.332377552986145, "learning_rate": 1.9331086446520252e-05, "loss": 0.9124, "step": 1022 }, { "epoch": 0.4255518290304968, "grad_norm": 1.2640204429626465, "learning_rate": 1.9329474415256727e-05, "loss": 1.011, "step": 1023 }, { "epoch": 0.4259678132230975, "grad_norm": 1.3124492168426514, "learning_rate": 1.9327860511272115e-05, "loss": 1.0027, "step": 1024 }, { "epoch": 0.4263837974156982, "grad_norm": 1.281412959098816, "learning_rate": 1.9326244734890374e-05, "loss": 0.9455, "step": 1025 }, { "epoch": 0.4267997816082989, "grad_norm": 1.3500958681106567, "learning_rate": 1.9324627086435845e-05, "loss": 1.0282, "step": 1026 }, { "epoch": 0.42721576580089954, "grad_norm": 1.2490442991256714, "learning_rate": 1.9323007566233238e-05, "loss": 0.9066, "step": 1027 }, { "epoch": 0.42763174999350023, "grad_norm": 1.3835549354553223, "learning_rate": 1.9321386174607642e-05, "loss": 1.0647, "step": 1028 }, { "epoch": 0.4280477341861009, "grad_norm": 1.3141168355941772, "learning_rate": 1.9319762911884524e-05, "loss": 1.0336, "step": 1029 }, { "epoch": 0.4284637183787016, "grad_norm": 1.389320969581604, "learning_rate": 1.9318137778389724e-05, "loss": 1.0018, "step": 1030 }, { "epoch": 0.4288797025713023, "grad_norm": 1.3693548440933228, "learning_rate": 1.9316510774449452e-05, "loss": 1.0714, "step": 1031 }, { "epoch": 0.42929568676390295, "grad_norm": 1.2483333349227905, "learning_rate": 1.931488190039031e-05, "loss": 1.0194, "step": 1032 }, { "epoch": 0.42971167095650364, "grad_norm": 1.2068318128585815, "learning_rate": 1.931325115653925e-05, "loss": 0.9492, "step": 1033 }, { "epoch": 0.43012765514910434, "grad_norm": 1.3573040962219238, "learning_rate": 1.9311618543223628e-05, "loss": 0.9807, "step": 1034 }, { "epoch": 0.43054363934170503, "grad_norm": 1.2498337030410767, "learning_rate": 1.9309984060771154e-05, "loss": 1.0024, "step": 1035 }, { "epoch": 0.43095962353430567, "grad_norm": 1.303828477859497, "learning_rate": 1.930834770950992e-05, "loss": 0.8573, "step": 1036 }, { "epoch": 0.43137560772690636, "grad_norm": 1.3577395677566528, "learning_rate": 1.93067094897684e-05, "loss": 1.0301, "step": 1037 }, { "epoch": 0.43179159191950706, "grad_norm": 1.2601711750030518, "learning_rate": 1.930506940187543e-05, "loss": 0.8706, "step": 1038 }, { "epoch": 0.43220757611210775, "grad_norm": 1.2963180541992188, "learning_rate": 1.930342744616023e-05, "loss": 0.8716, "step": 1039 }, { "epoch": 0.43262356030470844, "grad_norm": 1.3480446338653564, "learning_rate": 1.930178362295239e-05, "loss": 1.112, "step": 1040 }, { "epoch": 0.4330395444973091, "grad_norm": 1.353276014328003, "learning_rate": 1.9300137932581882e-05, "loss": 0.959, "step": 1041 }, { "epoch": 0.4334555286899098, "grad_norm": 1.33724045753479, "learning_rate": 1.9298490375379043e-05, "loss": 0.9366, "step": 1042 }, { "epoch": 0.43387151288251047, "grad_norm": 1.2926040887832642, "learning_rate": 1.92968409516746e-05, "loss": 0.8861, "step": 1043 }, { "epoch": 0.43428749707511116, "grad_norm": 1.309658169746399, "learning_rate": 1.929518966179963e-05, "loss": 0.8599, "step": 1044 }, { "epoch": 0.4347034812677118, "grad_norm": 1.2939854860305786, "learning_rate": 1.9293536506085607e-05, "loss": 0.8829, "step": 1045 }, { "epoch": 0.4351194654603125, "grad_norm": 1.257999300956726, "learning_rate": 1.9291881484864375e-05, "loss": 0.9618, "step": 1046 }, { "epoch": 0.4355354496529132, "grad_norm": 1.3168600797653198, "learning_rate": 1.9290224598468143e-05, "loss": 0.9908, "step": 1047 }, { "epoch": 0.4359514338455139, "grad_norm": 1.3504886627197266, "learning_rate": 1.9288565847229504e-05, "loss": 0.9783, "step": 1048 }, { "epoch": 0.4363674180381146, "grad_norm": 1.3859399557113647, "learning_rate": 1.928690523148142e-05, "loss": 0.9257, "step": 1049 }, { "epoch": 0.4367834022307152, "grad_norm": 1.2433223724365234, "learning_rate": 1.928524275155723e-05, "loss": 0.8466, "step": 1050 }, { "epoch": 0.4371993864233159, "grad_norm": 1.3621876239776611, "learning_rate": 1.928357840779065e-05, "loss": 0.9258, "step": 1051 }, { "epoch": 0.4376153706159166, "grad_norm": 441.50714111328125, "learning_rate": 1.928191220051576e-05, "loss": 1.0699, "step": 1052 }, { "epoch": 0.4380313548085173, "grad_norm": 1.4406864643096924, "learning_rate": 1.9280244130067024e-05, "loss": 0.9399, "step": 1053 }, { "epoch": 0.43844733900111793, "grad_norm": 1.3395252227783203, "learning_rate": 1.9278574196779277e-05, "loss": 0.844, "step": 1054 }, { "epoch": 0.4388633231937186, "grad_norm": 1.4136559963226318, "learning_rate": 1.9276902400987725e-05, "loss": 0.9335, "step": 1055 }, { "epoch": 0.4392793073863193, "grad_norm": 1.3330881595611572, "learning_rate": 1.9275228743027955e-05, "loss": 1.0359, "step": 1056 }, { "epoch": 0.43969529157892, "grad_norm": 1.2203049659729004, "learning_rate": 1.927355322323592e-05, "loss": 0.8237, "step": 1057 }, { "epoch": 0.4401112757715207, "grad_norm": 1.4034698009490967, "learning_rate": 1.927187584194795e-05, "loss": 0.9092, "step": 1058 }, { "epoch": 0.44052725996412134, "grad_norm": 1.246524691581726, "learning_rate": 1.9270196599500744e-05, "loss": 0.9536, "step": 1059 }, { "epoch": 0.44094324415672204, "grad_norm": 1.320143461227417, "learning_rate": 1.926851549623139e-05, "loss": 0.8973, "step": 1060 }, { "epoch": 0.44135922834932273, "grad_norm": 1.4415321350097656, "learning_rate": 1.9266832532477332e-05, "loss": 0.9099, "step": 1061 }, { "epoch": 0.4417752125419234, "grad_norm": 1.3974111080169678, "learning_rate": 1.9265147708576394e-05, "loss": 0.9346, "step": 1062 }, { "epoch": 0.44219119673452406, "grad_norm": 1.3307759761810303, "learning_rate": 1.9263461024866773e-05, "loss": 0.9024, "step": 1063 }, { "epoch": 0.44260718092712475, "grad_norm": 1.2740286588668823, "learning_rate": 1.926177248168704e-05, "loss": 0.9955, "step": 1064 }, { "epoch": 0.44302316511972545, "grad_norm": 1.2170672416687012, "learning_rate": 1.9260082079376145e-05, "loss": 0.8157, "step": 1065 }, { "epoch": 0.44343914931232614, "grad_norm": 1.5251185894012451, "learning_rate": 1.92583898182734e-05, "loss": 0.9864, "step": 1066 }, { "epoch": 0.44385513350492684, "grad_norm": 1.3269374370574951, "learning_rate": 1.925669569871849e-05, "loss": 0.9564, "step": 1067 }, { "epoch": 0.4442711176975275, "grad_norm": 1.3752257823944092, "learning_rate": 1.925499972105149e-05, "loss": 0.9056, "step": 1068 }, { "epoch": 0.44468710189012817, "grad_norm": 1.363873839378357, "learning_rate": 1.9253301885612828e-05, "loss": 1.0247, "step": 1069 }, { "epoch": 0.44510308608272886, "grad_norm": 1.2516567707061768, "learning_rate": 1.925160219274332e-05, "loss": 0.8626, "step": 1070 }, { "epoch": 0.44551907027532955, "grad_norm": 261.1200256347656, "learning_rate": 1.9249900642784142e-05, "loss": 0.8812, "step": 1071 }, { "epoch": 0.4459350544679302, "grad_norm": 1.2913014888763428, "learning_rate": 1.9248197236076852e-05, "loss": 0.9912, "step": 1072 }, { "epoch": 0.4463510386605309, "grad_norm": 4001.80224609375, "learning_rate": 1.9246491972963377e-05, "loss": 0.9447, "step": 1073 }, { "epoch": 0.4467670228531316, "grad_norm": 1.210842490196228, "learning_rate": 1.9244784853786014e-05, "loss": 0.9552, "step": 1074 }, { "epoch": 0.4471830070457323, "grad_norm": 1.2233834266662598, "learning_rate": 1.9243075878887444e-05, "loss": 0.7931, "step": 1075 }, { "epoch": 0.44759899123833297, "grad_norm": 1.303823471069336, "learning_rate": 1.924136504861071e-05, "loss": 0.9612, "step": 1076 }, { "epoch": 0.4480149754309336, "grad_norm": 1.4648082256317139, "learning_rate": 1.9239652363299224e-05, "loss": 0.9738, "step": 1077 }, { "epoch": 0.4484309596235343, "grad_norm": 1.3540860414505005, "learning_rate": 1.923793782329678e-05, "loss": 1.0104, "step": 1078 }, { "epoch": 0.448846943816135, "grad_norm": 1.438119888305664, "learning_rate": 1.923622142894754e-05, "loss": 1.0229, "step": 1079 }, { "epoch": 0.4492629280087357, "grad_norm": 1.4977909326553345, "learning_rate": 1.923450318059604e-05, "loss": 0.9742, "step": 1080 }, { "epoch": 0.4496789122013363, "grad_norm": 1.3892918825149536, "learning_rate": 1.9232783078587185e-05, "loss": 0.9801, "step": 1081 }, { "epoch": 0.450094896393937, "grad_norm": 1.3471951484680176, "learning_rate": 1.9231061123266258e-05, "loss": 0.9106, "step": 1082 }, { "epoch": 0.4505108805865377, "grad_norm": 1.1917730569839478, "learning_rate": 1.9229337314978905e-05, "loss": 0.9221, "step": 1083 }, { "epoch": 0.4509268647791384, "grad_norm": 1.3931549787521362, "learning_rate": 1.9227611654071146e-05, "loss": 0.9219, "step": 1084 }, { "epoch": 0.4513428489717391, "grad_norm": 1.2729346752166748, "learning_rate": 1.922588414088939e-05, "loss": 0.948, "step": 1085 }, { "epoch": 0.45175883316433973, "grad_norm": 1.3608040809631348, "learning_rate": 1.922415477578039e-05, "loss": 0.9401, "step": 1086 }, { "epoch": 0.45217481735694043, "grad_norm": 1.384126901626587, "learning_rate": 1.9222423559091287e-05, "loss": 0.9558, "step": 1087 }, { "epoch": 0.4525908015495411, "grad_norm": 1.2937551736831665, "learning_rate": 1.9220690491169595e-05, "loss": 0.9262, "step": 1088 }, { "epoch": 0.4530067857421418, "grad_norm": 1.3494963645935059, "learning_rate": 1.9218955572363192e-05, "loss": 0.9848, "step": 1089 }, { "epoch": 0.45342276993474245, "grad_norm": 33.1793327331543, "learning_rate": 1.9217218803020335e-05, "loss": 0.9194, "step": 1090 }, { "epoch": 0.45383875412734315, "grad_norm": 1.3022544384002686, "learning_rate": 1.921548018348964e-05, "loss": 0.9595, "step": 1091 }, { "epoch": 0.45425473831994384, "grad_norm": 1.2890968322753906, "learning_rate": 1.9213739714120114e-05, "loss": 1.0094, "step": 1092 }, { "epoch": 0.45467072251254453, "grad_norm": 1.3255194425582886, "learning_rate": 1.921199739526112e-05, "loss": 0.9255, "step": 1093 }, { "epoch": 0.4550867067051452, "grad_norm": 1.4401986598968506, "learning_rate": 1.9210253227262395e-05, "loss": 1.0127, "step": 1094 }, { "epoch": 0.45550269089774587, "grad_norm": 1.2549524307250977, "learning_rate": 1.9208507210474054e-05, "loss": 0.9878, "step": 1095 }, { "epoch": 0.45591867509034656, "grad_norm": 1.358559250831604, "learning_rate": 1.9206759345246565e-05, "loss": 0.9349, "step": 1096 }, { "epoch": 0.45633465928294725, "grad_norm": 1.2851943969726562, "learning_rate": 1.9205009631930795e-05, "loss": 0.9641, "step": 1097 }, { "epoch": 0.45675064347554795, "grad_norm": 1.220712661743164, "learning_rate": 1.920325807087796e-05, "loss": 0.8838, "step": 1098 }, { "epoch": 0.4571666276681486, "grad_norm": 1.321742057800293, "learning_rate": 1.9201504662439653e-05, "loss": 0.9471, "step": 1099 }, { "epoch": 0.4575826118607493, "grad_norm": 1.3898475170135498, "learning_rate": 1.919974940696784e-05, "loss": 1.0902, "step": 1100 }, { "epoch": 0.45799859605334997, "grad_norm": 77.18681335449219, "learning_rate": 1.9197992304814858e-05, "loss": 0.8237, "step": 1101 }, { "epoch": 0.45841458024595066, "grad_norm": 1.3247703313827515, "learning_rate": 1.919623335633341e-05, "loss": 1.0139, "step": 1102 }, { "epoch": 0.45883056443855136, "grad_norm": 1.454421043395996, "learning_rate": 1.9194472561876572e-05, "loss": 0.9393, "step": 1103 }, { "epoch": 0.459246548631152, "grad_norm": 1.42146897315979, "learning_rate": 1.91927099217978e-05, "loss": 0.9254, "step": 1104 }, { "epoch": 0.4596625328237527, "grad_norm": 1.1488977670669556, "learning_rate": 1.91909454364509e-05, "loss": 0.77, "step": 1105 }, { "epoch": 0.4600785170163534, "grad_norm": 1.3553240299224854, "learning_rate": 1.9189179106190066e-05, "loss": 0.9947, "step": 1106 }, { "epoch": 0.4604945012089541, "grad_norm": 1.4486043453216553, "learning_rate": 1.9187410931369855e-05, "loss": 1.0841, "step": 1107 }, { "epoch": 0.4609104854015547, "grad_norm": 1.2992154359817505, "learning_rate": 1.91856409123452e-05, "loss": 0.9869, "step": 1108 }, { "epoch": 0.4613264695941554, "grad_norm": 1.3679198026657104, "learning_rate": 1.9183869049471396e-05, "loss": 0.8968, "step": 1109 }, { "epoch": 0.4617424537867561, "grad_norm": 1.279668927192688, "learning_rate": 1.918209534310411e-05, "loss": 0.8536, "step": 1110 }, { "epoch": 0.4621584379793568, "grad_norm": 1.3440452814102173, "learning_rate": 1.9180319793599383e-05, "loss": 0.9544, "step": 1111 }, { "epoch": 0.4625744221719575, "grad_norm": 1.3729469776153564, "learning_rate": 1.9178542401313624e-05, "loss": 0.9556, "step": 1112 }, { "epoch": 0.4629904063645581, "grad_norm": 2.7053375244140625, "learning_rate": 1.9176763166603613e-05, "loss": 0.9701, "step": 1113 }, { "epoch": 0.4634063905571588, "grad_norm": 1.2219696044921875, "learning_rate": 1.9174982089826498e-05, "loss": 0.8724, "step": 1114 }, { "epoch": 0.4638223747497595, "grad_norm": 1.4122004508972168, "learning_rate": 1.9173199171339795e-05, "loss": 0.9444, "step": 1115 }, { "epoch": 0.4642383589423602, "grad_norm": 1.4474482536315918, "learning_rate": 1.9171414411501402e-05, "loss": 0.9983, "step": 1116 }, { "epoch": 0.46465434313496085, "grad_norm": 1.3851759433746338, "learning_rate": 1.9169627810669563e-05, "loss": 0.9683, "step": 1117 }, { "epoch": 0.46507032732756154, "grad_norm": 1.2648279666900635, "learning_rate": 1.9167839369202915e-05, "loss": 0.909, "step": 1118 }, { "epoch": 0.46548631152016223, "grad_norm": 7.221245288848877, "learning_rate": 1.916604908746045e-05, "loss": 0.8902, "step": 1119 }, { "epoch": 0.4659022957127629, "grad_norm": 1.2536909580230713, "learning_rate": 1.916425696580154e-05, "loss": 0.8717, "step": 1120 }, { "epoch": 0.4663182799053636, "grad_norm": 1.2178640365600586, "learning_rate": 1.9162463004585915e-05, "loss": 1.0557, "step": 1121 }, { "epoch": 0.46673426409796426, "grad_norm": 1.240866780281067, "learning_rate": 1.916066720417368e-05, "loss": 0.9864, "step": 1122 }, { "epoch": 0.46715024829056495, "grad_norm": 1.313364863395691, "learning_rate": 1.9158869564925313e-05, "loss": 0.8751, "step": 1123 }, { "epoch": 0.46756623248316564, "grad_norm": 1.3673666715621948, "learning_rate": 1.9157070087201654e-05, "loss": 1.0027, "step": 1124 }, { "epoch": 0.46798221667576634, "grad_norm": 1.210588812828064, "learning_rate": 1.9155268771363915e-05, "loss": 1.0977, "step": 1125 }, { "epoch": 0.468398200868367, "grad_norm": 1.4034714698791504, "learning_rate": 1.9153465617773678e-05, "loss": 0.994, "step": 1126 }, { "epoch": 0.46881418506096767, "grad_norm": 1.3829035758972168, "learning_rate": 1.9151660626792897e-05, "loss": 1.0026, "step": 1127 }, { "epoch": 0.46923016925356836, "grad_norm": 1.3256059885025024, "learning_rate": 1.9149853798783883e-05, "loss": 0.9615, "step": 1128 }, { "epoch": 0.46964615344616906, "grad_norm": 1.360345482826233, "learning_rate": 1.914804513410933e-05, "loss": 1.0078, "step": 1129 }, { "epoch": 0.47006213763876975, "grad_norm": 1.4145985841751099, "learning_rate": 1.9146234633132292e-05, "loss": 1.0701, "step": 1130 }, { "epoch": 0.4704781218313704, "grad_norm": 1.3360434770584106, "learning_rate": 1.9144422296216193e-05, "loss": 0.9314, "step": 1131 }, { "epoch": 0.4708941060239711, "grad_norm": 1.3567742109298706, "learning_rate": 1.914260812372483e-05, "loss": 0.8583, "step": 1132 }, { "epoch": 0.4713100902165718, "grad_norm": 1.3608393669128418, "learning_rate": 1.914079211602236e-05, "loss": 0.947, "step": 1133 }, { "epoch": 0.47172607440917247, "grad_norm": 1.4831615686416626, "learning_rate": 1.9138974273473315e-05, "loss": 0.9897, "step": 1134 }, { "epoch": 0.4721420586017731, "grad_norm": 1.4972509145736694, "learning_rate": 1.9137154596442595e-05, "loss": 0.8812, "step": 1135 }, { "epoch": 0.4725580427943738, "grad_norm": 1.4579721689224243, "learning_rate": 1.9135333085295465e-05, "loss": 0.9989, "step": 1136 }, { "epoch": 0.4729740269869745, "grad_norm": 1.3454123735427856, "learning_rate": 1.913350974039756e-05, "loss": 1.0035, "step": 1137 }, { "epoch": 0.4733900111795752, "grad_norm": 1.3640401363372803, "learning_rate": 1.9131684562114887e-05, "loss": 0.8048, "step": 1138 }, { "epoch": 0.4738059953721759, "grad_norm": 1.3207602500915527, "learning_rate": 1.9129857550813817e-05, "loss": 0.9099, "step": 1139 }, { "epoch": 0.4742219795647765, "grad_norm": 1.2744249105453491, "learning_rate": 1.912802870686108e-05, "loss": 0.9941, "step": 1140 }, { "epoch": 0.4746379637573772, "grad_norm": 5.563164710998535, "learning_rate": 1.912619803062379e-05, "loss": 0.9139, "step": 1141 }, { "epoch": 0.4750539479499779, "grad_norm": 1.4212507009506226, "learning_rate": 1.9124365522469424e-05, "loss": 1.1861, "step": 1142 }, { "epoch": 0.4754699321425786, "grad_norm": 1.3275035619735718, "learning_rate": 1.9122531182765817e-05, "loss": 0.9802, "step": 1143 }, { "epoch": 0.47588591633517924, "grad_norm": 1.2761014699935913, "learning_rate": 1.912069501188119e-05, "loss": 0.8382, "step": 1144 }, { "epoch": 0.47630190052777993, "grad_norm": 1.3255205154418945, "learning_rate": 1.9118857010184107e-05, "loss": 1.0107, "step": 1145 }, { "epoch": 0.4767178847203806, "grad_norm": 1.313738226890564, "learning_rate": 1.9117017178043523e-05, "loss": 0.9014, "step": 1146 }, { "epoch": 0.4771338689129813, "grad_norm": 1.311159610748291, "learning_rate": 1.911517551582875e-05, "loss": 0.927, "step": 1147 }, { "epoch": 0.477549853105582, "grad_norm": 1.261963963508606, "learning_rate": 1.9113332023909466e-05, "loss": 0.9228, "step": 1148 }, { "epoch": 0.47796583729818265, "grad_norm": 1.848658561706543, "learning_rate": 1.9111486702655714e-05, "loss": 0.9495, "step": 1149 }, { "epoch": 0.47838182149078334, "grad_norm": 1.3033440113067627, "learning_rate": 1.9109639552437915e-05, "loss": 0.9121, "step": 1150 }, { "epoch": 0.47879780568338404, "grad_norm": 1.2776395082473755, "learning_rate": 1.910779057362685e-05, "loss": 1.0039, "step": 1151 }, { "epoch": 0.47921378987598473, "grad_norm": 277.0197448730469, "learning_rate": 1.9105939766593663e-05, "loss": 0.9501, "step": 1152 }, { "epoch": 0.47962977406858537, "grad_norm": 1.6142213344573975, "learning_rate": 1.9104087131709873e-05, "loss": 0.9744, "step": 1153 }, { "epoch": 0.48004575826118606, "grad_norm": 1.4198602437973022, "learning_rate": 1.9102232669347362e-05, "loss": 1.0214, "step": 1154 }, { "epoch": 0.48046174245378676, "grad_norm": 1.4821361303329468, "learning_rate": 1.910037637987838e-05, "loss": 1.0051, "step": 1155 }, { "epoch": 0.48087772664638745, "grad_norm": 1.401656150817871, "learning_rate": 1.9098518263675542e-05, "loss": 0.9469, "step": 1156 }, { "epoch": 0.48129371083898814, "grad_norm": 1.2179243564605713, "learning_rate": 1.9096658321111828e-05, "loss": 0.9148, "step": 1157 }, { "epoch": 0.4817096950315888, "grad_norm": 1.4164255857467651, "learning_rate": 1.9094796552560596e-05, "loss": 0.9686, "step": 1158 }, { "epoch": 0.4821256792241895, "grad_norm": 1.3950669765472412, "learning_rate": 1.9092932958395553e-05, "loss": 0.9976, "step": 1159 }, { "epoch": 0.48254166341679017, "grad_norm": 1.317973256111145, "learning_rate": 1.9091067538990785e-05, "loss": 0.8955, "step": 1160 }, { "epoch": 0.48295764760939086, "grad_norm": 1.3497916460037231, "learning_rate": 1.9089200294720737e-05, "loss": 0.944, "step": 1161 }, { "epoch": 0.4833736318019915, "grad_norm": 1.3586950302124023, "learning_rate": 1.9087331225960232e-05, "loss": 0.8267, "step": 1162 }, { "epoch": 0.4837896159945922, "grad_norm": 1.5133851766586304, "learning_rate": 1.9085460333084447e-05, "loss": 0.9774, "step": 1163 }, { "epoch": 0.4842056001871929, "grad_norm": 1.3380718231201172, "learning_rate": 1.9083587616468928e-05, "loss": 0.9735, "step": 1164 }, { "epoch": 0.4846215843797936, "grad_norm": 1.2937462329864502, "learning_rate": 1.9081713076489592e-05, "loss": 0.8704, "step": 1165 }, { "epoch": 0.4850375685723943, "grad_norm": 1.3959592580795288, "learning_rate": 1.9079836713522715e-05, "loss": 0.9114, "step": 1166 }, { "epoch": 0.4854535527649949, "grad_norm": 1.4328175783157349, "learning_rate": 1.9077958527944944e-05, "loss": 0.9838, "step": 1167 }, { "epoch": 0.4858695369575956, "grad_norm": 1.9442048072814941, "learning_rate": 1.907607852013329e-05, "loss": 0.8153, "step": 1168 }, { "epoch": 0.4862855211501963, "grad_norm": 1.352392315864563, "learning_rate": 1.907419669046513e-05, "loss": 0.9099, "step": 1169 }, { "epoch": 0.486701505342797, "grad_norm": 1.3154784440994263, "learning_rate": 1.9072313039318208e-05, "loss": 0.9816, "step": 1170 }, { "epoch": 0.48711748953539763, "grad_norm": 1.531435251235962, "learning_rate": 1.907042756707063e-05, "loss": 1.0026, "step": 1171 }, { "epoch": 0.4875334737279983, "grad_norm": 1.3437968492507935, "learning_rate": 1.9068540274100876e-05, "loss": 0.7686, "step": 1172 }, { "epoch": 0.487949457920599, "grad_norm": 1.4637930393218994, "learning_rate": 1.9066651160787783e-05, "loss": 0.944, "step": 1173 }, { "epoch": 0.4883654421131997, "grad_norm": 1.3837467432022095, "learning_rate": 1.9064760227510547e-05, "loss": 1.017, "step": 1174 }, { "epoch": 0.4887814263058004, "grad_norm": 1.3058534860610962, "learning_rate": 1.906286747464875e-05, "loss": 0.9293, "step": 1175 }, { "epoch": 0.48919741049840104, "grad_norm": 1.2814832925796509, "learning_rate": 1.9060972902582323e-05, "loss": 0.8928, "step": 1176 }, { "epoch": 0.48961339469100174, "grad_norm": 1.328685998916626, "learning_rate": 1.9059076511691566e-05, "loss": 0.8678, "step": 1177 }, { "epoch": 0.49002937888360243, "grad_norm": 1.4344744682312012, "learning_rate": 1.9057178302357143e-05, "loss": 0.9436, "step": 1178 }, { "epoch": 0.4904453630762031, "grad_norm": 1.482980728149414, "learning_rate": 1.9055278274960088e-05, "loss": 1.061, "step": 1179 }, { "epoch": 0.49086134726880376, "grad_norm": 1.3468337059020996, "learning_rate": 1.9053376429881798e-05, "loss": 0.9184, "step": 1180 }, { "epoch": 0.49127733146140445, "grad_norm": 1.4965660572052002, "learning_rate": 1.905147276750403e-05, "loss": 0.9523, "step": 1181 }, { "epoch": 0.49169331565400515, "grad_norm": 1.4603571891784668, "learning_rate": 1.904956728820891e-05, "loss": 1.0396, "step": 1182 }, { "epoch": 0.49210929984660584, "grad_norm": 1.287436604499817, "learning_rate": 1.9047659992378927e-05, "loss": 0.9176, "step": 1183 }, { "epoch": 0.49252528403920653, "grad_norm": 1.407139778137207, "learning_rate": 1.904575088039694e-05, "loss": 0.8643, "step": 1184 }, { "epoch": 0.4929412682318072, "grad_norm": 1.2897592782974243, "learning_rate": 1.9043839952646167e-05, "loss": 0.9101, "step": 1185 }, { "epoch": 0.49335725242440787, "grad_norm": 1.3422729969024658, "learning_rate": 1.904192720951019e-05, "loss": 0.8806, "step": 1186 }, { "epoch": 0.49377323661700856, "grad_norm": 1.3989169597625732, "learning_rate": 1.9040012651372955e-05, "loss": 0.8881, "step": 1187 }, { "epoch": 0.49418922080960925, "grad_norm": 1.4360204935073853, "learning_rate": 1.9038096278618778e-05, "loss": 1.0129, "step": 1188 }, { "epoch": 0.4946052050022099, "grad_norm": 1.5211210250854492, "learning_rate": 1.9036178091632335e-05, "loss": 0.9231, "step": 1189 }, { "epoch": 0.4950211891948106, "grad_norm": 1.375605583190918, "learning_rate": 1.9034258090798666e-05, "loss": 0.9651, "step": 1190 }, { "epoch": 0.4954371733874113, "grad_norm": 1.439351201057434, "learning_rate": 1.9032336276503178e-05, "loss": 0.9754, "step": 1191 }, { "epoch": 0.49585315758001197, "grad_norm": 1.481101155281067, "learning_rate": 1.9030412649131637e-05, "loss": 1.0265, "step": 1192 }, { "epoch": 0.49626914177261267, "grad_norm": 1.4144830703735352, "learning_rate": 1.9028487209070178e-05, "loss": 0.9222, "step": 1193 }, { "epoch": 0.4966851259652133, "grad_norm": 1.4308006763458252, "learning_rate": 1.90265599567053e-05, "loss": 0.9097, "step": 1194 }, { "epoch": 0.497101110157814, "grad_norm": 1.2550328969955444, "learning_rate": 1.9024630892423857e-05, "loss": 0.8529, "step": 1195 }, { "epoch": 0.4975170943504147, "grad_norm": 1.3606452941894531, "learning_rate": 1.9022700016613077e-05, "loss": 0.8946, "step": 1196 }, { "epoch": 0.4979330785430154, "grad_norm": 3.9234414100646973, "learning_rate": 1.9020767329660553e-05, "loss": 0.8889, "step": 1197 }, { "epoch": 0.498349062735616, "grad_norm": 1.4164423942565918, "learning_rate": 1.9018832831954232e-05, "loss": 0.9465, "step": 1198 }, { "epoch": 0.4987650469282167, "grad_norm": 1.3399057388305664, "learning_rate": 1.9016896523882424e-05, "loss": 0.9469, "step": 1199 }, { "epoch": 0.4991810311208174, "grad_norm": 1.409984827041626, "learning_rate": 1.9014958405833816e-05, "loss": 1.0045, "step": 1200 }, { "epoch": 0.4995970153134181, "grad_norm": 1.3873180150985718, "learning_rate": 1.9013018478197443e-05, "loss": 1.0255, "step": 1201 }, { "epoch": 0.5000129995060187, "grad_norm": 1.383522629737854, "learning_rate": 1.9011076741362717e-05, "loss": 0.9102, "step": 1202 }, { "epoch": 0.5004289836986194, "grad_norm": 1.3438936471939087, "learning_rate": 1.9009133195719394e-05, "loss": 0.9898, "step": 1203 }, { "epoch": 0.5008449678912201, "grad_norm": 61.50424575805664, "learning_rate": 1.900718784165762e-05, "loss": 0.8496, "step": 1204 }, { "epoch": 0.5012609520838208, "grad_norm": 1.3702030181884766, "learning_rate": 1.900524067956788e-05, "loss": 0.8996, "step": 1205 }, { "epoch": 0.5016769362764215, "grad_norm": 1.3732714653015137, "learning_rate": 1.9003291709841034e-05, "loss": 0.9654, "step": 1206 }, { "epoch": 0.5020929204690222, "grad_norm": 1.354452133178711, "learning_rate": 1.9001340932868298e-05, "loss": 0.9174, "step": 1207 }, { "epoch": 0.5025089046616229, "grad_norm": 112.27633666992188, "learning_rate": 1.8999388349041265e-05, "loss": 0.9362, "step": 1208 }, { "epoch": 0.5029248888542235, "grad_norm": 1.2126797437667847, "learning_rate": 1.899743395875187e-05, "loss": 0.9142, "step": 1209 }, { "epoch": 0.5033408730468242, "grad_norm": 1.298490285873413, "learning_rate": 1.8995477762392423e-05, "loss": 0.8917, "step": 1210 }, { "epoch": 0.5037568572394249, "grad_norm": 1.2456433773040771, "learning_rate": 1.8993519760355597e-05, "loss": 0.9027, "step": 1211 }, { "epoch": 0.5041728414320256, "grad_norm": 1.2365660667419434, "learning_rate": 1.8991559953034425e-05, "loss": 0.8108, "step": 1212 }, { "epoch": 0.5045888256246263, "grad_norm": 1.4420632123947144, "learning_rate": 1.89895983408223e-05, "loss": 0.9536, "step": 1213 }, { "epoch": 0.505004809817227, "grad_norm": 1.3646659851074219, "learning_rate": 1.8987634924112983e-05, "loss": 0.9426, "step": 1214 }, { "epoch": 0.5054207940098276, "grad_norm": 1.3906028270721436, "learning_rate": 1.8985669703300593e-05, "loss": 0.8426, "step": 1215 }, { "epoch": 0.5058367782024283, "grad_norm": 26.242490768432617, "learning_rate": 1.8983702678779608e-05, "loss": 0.805, "step": 1216 }, { "epoch": 0.506252762395029, "grad_norm": 1.4041997194290161, "learning_rate": 1.898173385094488e-05, "loss": 1.0332, "step": 1217 }, { "epoch": 0.5066687465876296, "grad_norm": 1.372989296913147, "learning_rate": 1.8979763220191608e-05, "loss": 0.999, "step": 1218 }, { "epoch": 0.5070847307802303, "grad_norm": 1.361146092414856, "learning_rate": 1.8977790786915366e-05, "loss": 0.9553, "step": 1219 }, { "epoch": 0.507500714972831, "grad_norm": 1.5371532440185547, "learning_rate": 1.8975816551512076e-05, "loss": 1.0564, "step": 1220 }, { "epoch": 0.5079166991654317, "grad_norm": 1.5255887508392334, "learning_rate": 1.8973840514378034e-05, "loss": 1.0527, "step": 1221 }, { "epoch": 0.5083326833580324, "grad_norm": 1.496948003768921, "learning_rate": 1.8971862675909896e-05, "loss": 0.9321, "step": 1222 }, { "epoch": 0.5087486675506331, "grad_norm": 1.4089343547821045, "learning_rate": 1.8969883036504673e-05, "loss": 0.8959, "step": 1223 }, { "epoch": 0.5091646517432338, "grad_norm": 1.3218903541564941, "learning_rate": 1.8967901596559743e-05, "loss": 0.9219, "step": 1224 }, { "epoch": 0.5095806359358345, "grad_norm": 1.3533345460891724, "learning_rate": 1.8965918356472843e-05, "loss": 0.9269, "step": 1225 }, { "epoch": 0.5099966201284352, "grad_norm": 1.3433420658111572, "learning_rate": 1.8963933316642075e-05, "loss": 1.021, "step": 1226 }, { "epoch": 0.5104126043210357, "grad_norm": 1.33853280544281, "learning_rate": 1.8961946477465894e-05, "loss": 1.0217, "step": 1227 }, { "epoch": 0.5108285885136364, "grad_norm": 1.4799206256866455, "learning_rate": 1.8959957839343125e-05, "loss": 0.9594, "step": 1228 }, { "epoch": 0.5112445727062371, "grad_norm": 1.2651113271713257, "learning_rate": 1.895796740267295e-05, "loss": 0.8848, "step": 1229 }, { "epoch": 0.5116605568988378, "grad_norm": 1.3132308721542358, "learning_rate": 1.895597516785491e-05, "loss": 0.9645, "step": 1230 }, { "epoch": 0.5120765410914385, "grad_norm": 1.425881028175354, "learning_rate": 1.8953981135288915e-05, "loss": 0.986, "step": 1231 }, { "epoch": 0.5124925252840392, "grad_norm": 1.4590582847595215, "learning_rate": 1.8951985305375232e-05, "loss": 0.9294, "step": 1232 }, { "epoch": 0.5129085094766399, "grad_norm": 1.3159496784210205, "learning_rate": 1.8949987678514476e-05, "loss": 0.8958, "step": 1233 }, { "epoch": 0.5133244936692406, "grad_norm": 1.458682894706726, "learning_rate": 1.8947988255107644e-05, "loss": 0.9634, "step": 1234 }, { "epoch": 0.5137404778618413, "grad_norm": 1.3288187980651855, "learning_rate": 1.8945987035556084e-05, "loss": 0.8896, "step": 1235 }, { "epoch": 0.5141564620544419, "grad_norm": 1.2575626373291016, "learning_rate": 1.8943984020261495e-05, "loss": 0.8931, "step": 1236 }, { "epoch": 0.5145724462470426, "grad_norm": 1.4020060300827026, "learning_rate": 1.894197920962596e-05, "loss": 0.8595, "step": 1237 }, { "epoch": 0.5149884304396433, "grad_norm": 1.264275312423706, "learning_rate": 1.8939972604051892e-05, "loss": 0.9639, "step": 1238 }, { "epoch": 0.515404414632244, "grad_norm": 1.3954875469207764, "learning_rate": 1.893796420394209e-05, "loss": 1.0203, "step": 1239 }, { "epoch": 0.5158203988248447, "grad_norm": 1.2933363914489746, "learning_rate": 1.8935954009699703e-05, "loss": 0.9125, "step": 1240 }, { "epoch": 0.5162363830174453, "grad_norm": 1.3262174129486084, "learning_rate": 1.8933942021728238e-05, "loss": 0.9403, "step": 1241 }, { "epoch": 0.516652367210046, "grad_norm": 1.5306349992752075, "learning_rate": 1.8931928240431567e-05, "loss": 0.9839, "step": 1242 }, { "epoch": 0.5170683514026467, "grad_norm": 1.764039158821106, "learning_rate": 1.8929912666213916e-05, "loss": 0.865, "step": 1243 }, { "epoch": 0.5174843355952474, "grad_norm": 1.4177653789520264, "learning_rate": 1.8927895299479878e-05, "loss": 1.0385, "step": 1244 }, { "epoch": 0.517900319787848, "grad_norm": 648.2955322265625, "learning_rate": 1.8925876140634404e-05, "loss": 0.8645, "step": 1245 }, { "epoch": 0.5183163039804487, "grad_norm": 1.45958411693573, "learning_rate": 1.89238551900828e-05, "loss": 0.9426, "step": 1246 }, { "epoch": 0.5187322881730494, "grad_norm": 1.433025598526001, "learning_rate": 1.8921832448230737e-05, "loss": 0.8928, "step": 1247 }, { "epoch": 0.5191482723656501, "grad_norm": 1.410807490348816, "learning_rate": 1.8919807915484238e-05, "loss": 1.0349, "step": 1248 }, { "epoch": 0.5195642565582508, "grad_norm": 1.4288657903671265, "learning_rate": 1.8917781592249697e-05, "loss": 0.9752, "step": 1249 }, { "epoch": 0.5199802407508515, "grad_norm": 1.3425123691558838, "learning_rate": 1.891575347893386e-05, "loss": 0.8289, "step": 1250 }, { "epoch": 0.5203962249434522, "grad_norm": 1.3186370134353638, "learning_rate": 1.891372357594383e-05, "loss": 0.9296, "step": 1251 }, { "epoch": 0.5208122091360529, "grad_norm": 1.4767510890960693, "learning_rate": 1.8911691883687078e-05, "loss": 0.9793, "step": 1252 }, { "epoch": 0.5212281933286536, "grad_norm": 1.7405935525894165, "learning_rate": 1.8909658402571427e-05, "loss": 0.9457, "step": 1253 }, { "epoch": 0.5216441775212541, "grad_norm": 1.3564823865890503, "learning_rate": 1.890762313300506e-05, "loss": 0.8893, "step": 1254 }, { "epoch": 0.5220601617138548, "grad_norm": 1.3696666955947876, "learning_rate": 1.890558607539652e-05, "loss": 0.9258, "step": 1255 }, { "epoch": 0.5224761459064555, "grad_norm": 1.3836182355880737, "learning_rate": 1.8903547230154706e-05, "loss": 0.9343, "step": 1256 }, { "epoch": 0.5228921300990562, "grad_norm": 1.3675436973571777, "learning_rate": 1.890150659768889e-05, "loss": 0.934, "step": 1257 }, { "epoch": 0.5233081142916569, "grad_norm": 5.581067085266113, "learning_rate": 1.889946417840868e-05, "loss": 1.0355, "step": 1258 }, { "epoch": 0.5237240984842576, "grad_norm": 1.3901281356811523, "learning_rate": 1.8897419972724056e-05, "loss": 0.91, "step": 1259 }, { "epoch": 0.5241400826768583, "grad_norm": 1.3759044408798218, "learning_rate": 1.889537398104536e-05, "loss": 0.8391, "step": 1260 }, { "epoch": 0.524556066869459, "grad_norm": 1.2826098203659058, "learning_rate": 1.8893326203783285e-05, "loss": 0.8443, "step": 1261 }, { "epoch": 0.5249720510620597, "grad_norm": 1.3578007221221924, "learning_rate": 1.8891276641348883e-05, "loss": 1.0038, "step": 1262 }, { "epoch": 0.5253880352546603, "grad_norm": 1.4672558307647705, "learning_rate": 1.8889225294153567e-05, "loss": 0.9201, "step": 1263 }, { "epoch": 0.525804019447261, "grad_norm": 1.4155452251434326, "learning_rate": 1.8887172162609107e-05, "loss": 0.9142, "step": 1264 }, { "epoch": 0.5262200036398617, "grad_norm": 1.286329984664917, "learning_rate": 1.8885117247127635e-05, "loss": 0.8623, "step": 1265 }, { "epoch": 0.5266359878324623, "grad_norm": 1.328026533126831, "learning_rate": 1.8883060548121637e-05, "loss": 0.8453, "step": 1266 }, { "epoch": 0.527051972025063, "grad_norm": 1.4518990516662598, "learning_rate": 1.888100206600395e-05, "loss": 1.0234, "step": 1267 }, { "epoch": 0.5274679562176637, "grad_norm": 1.4719469547271729, "learning_rate": 1.887894180118779e-05, "loss": 0.955, "step": 1268 }, { "epoch": 0.5278839404102644, "grad_norm": 1.416170358657837, "learning_rate": 1.8876879754086702e-05, "loss": 0.8358, "step": 1269 }, { "epoch": 0.5282999246028651, "grad_norm": 1.3545684814453125, "learning_rate": 1.8874815925114615e-05, "loss": 0.9544, "step": 1270 }, { "epoch": 0.5287159087954658, "grad_norm": 1.4604121446609497, "learning_rate": 1.88727503146858e-05, "loss": 1.0163, "step": 1271 }, { "epoch": 0.5291318929880664, "grad_norm": 1.5146937370300293, "learning_rate": 1.8870682923214897e-05, "loss": 0.8795, "step": 1272 }, { "epoch": 0.5295478771806671, "grad_norm": 1.4033256769180298, "learning_rate": 1.8868613751116888e-05, "loss": 0.8934, "step": 1273 }, { "epoch": 0.5299638613732678, "grad_norm": 1.4064432382583618, "learning_rate": 1.8866542798807124e-05, "loss": 0.973, "step": 1274 }, { "epoch": 0.5303798455658685, "grad_norm": 1.4211106300354004, "learning_rate": 1.8864470066701317e-05, "loss": 0.9442, "step": 1275 }, { "epoch": 0.5307958297584692, "grad_norm": 1.3464194536209106, "learning_rate": 1.8862395555215522e-05, "loss": 0.9775, "step": 1276 }, { "epoch": 0.5312118139510699, "grad_norm": 1.4398199319839478, "learning_rate": 1.8860319264766163e-05, "loss": 0.9835, "step": 1277 }, { "epoch": 0.5316277981436706, "grad_norm": 1.411030888557434, "learning_rate": 1.8858241195770016e-05, "loss": 0.8994, "step": 1278 }, { "epoch": 0.5320437823362713, "grad_norm": 1.6852378845214844, "learning_rate": 1.8856161348644217e-05, "loss": 0.9067, "step": 1279 }, { "epoch": 0.532459766528872, "grad_norm": 13.965109825134277, "learning_rate": 1.8854079723806257e-05, "loss": 0.9232, "step": 1280 }, { "epoch": 0.5328757507214725, "grad_norm": 1.328009009361267, "learning_rate": 1.8851996321673983e-05, "loss": 1.0278, "step": 1281 }, { "epoch": 0.5332917349140732, "grad_norm": 1.5005353689193726, "learning_rate": 1.8849911142665597e-05, "loss": 1.1141, "step": 1282 }, { "epoch": 0.5337077191066739, "grad_norm": 1.4769259691238403, "learning_rate": 1.8847824187199667e-05, "loss": 0.9507, "step": 1283 }, { "epoch": 0.5341237032992746, "grad_norm": 1.2979477643966675, "learning_rate": 1.8845735455695107e-05, "loss": 0.9031, "step": 1284 }, { "epoch": 0.5345396874918753, "grad_norm": 1.3786081075668335, "learning_rate": 1.8843644948571195e-05, "loss": 1.0075, "step": 1285 }, { "epoch": 0.534955671684476, "grad_norm": 1.4004333019256592, "learning_rate": 1.8841552666247556e-05, "loss": 0.8922, "step": 1286 }, { "epoch": 0.5353716558770767, "grad_norm": 1.5097477436065674, "learning_rate": 1.8839458609144178e-05, "loss": 1.0921, "step": 1287 }, { "epoch": 0.5357876400696774, "grad_norm": 1.2807544469833374, "learning_rate": 1.8837362777681413e-05, "loss": 0.9309, "step": 1288 }, { "epoch": 0.5362036242622781, "grad_norm": 1.5956394672393799, "learning_rate": 1.8835265172279952e-05, "loss": 1.0471, "step": 1289 }, { "epoch": 0.5366196084548787, "grad_norm": 1.3170928955078125, "learning_rate": 1.8833165793360852e-05, "loss": 0.9504, "step": 1290 }, { "epoch": 0.5370355926474794, "grad_norm": 1.346318244934082, "learning_rate": 1.883106464134553e-05, "loss": 0.9558, "step": 1291 }, { "epoch": 0.53745157684008, "grad_norm": 1.4396618604660034, "learning_rate": 1.8828961716655748e-05, "loss": 0.8852, "step": 1292 }, { "epoch": 0.5378675610326807, "grad_norm": 1.4400832653045654, "learning_rate": 1.8826857019713635e-05, "loss": 0.9163, "step": 1293 }, { "epoch": 0.5382835452252814, "grad_norm": 1.3637775182724, "learning_rate": 1.8824750550941664e-05, "loss": 0.949, "step": 1294 }, { "epoch": 0.5386995294178821, "grad_norm": 1.4780441522598267, "learning_rate": 1.8822642310762673e-05, "loss": 0.9428, "step": 1295 }, { "epoch": 0.5391155136104828, "grad_norm": 1.2974770069122314, "learning_rate": 1.8820532299599858e-05, "loss": 0.9543, "step": 1296 }, { "epoch": 0.5395314978030835, "grad_norm": 1.3114370107650757, "learning_rate": 1.8818420517876754e-05, "loss": 0.9111, "step": 1297 }, { "epoch": 0.5399474819956842, "grad_norm": 1.4994982481002808, "learning_rate": 1.8816306966017276e-05, "loss": 0.8699, "step": 1298 }, { "epoch": 0.5403634661882848, "grad_norm": 1.7146719694137573, "learning_rate": 1.8814191644445667e-05, "loss": 0.9252, "step": 1299 }, { "epoch": 0.5407794503808855, "grad_norm": 1.6877524852752686, "learning_rate": 1.881207455358655e-05, "loss": 0.9356, "step": 1300 }, { "epoch": 0.5411954345734862, "grad_norm": 1.37538743019104, "learning_rate": 1.8809955693864882e-05, "loss": 0.784, "step": 1301 }, { "epoch": 0.5416114187660869, "grad_norm": 1.4291024208068848, "learning_rate": 1.8807835065705998e-05, "loss": 0.8745, "step": 1302 }, { "epoch": 0.5420274029586876, "grad_norm": 1.488782525062561, "learning_rate": 1.880571266953556e-05, "loss": 0.8854, "step": 1303 }, { "epoch": 0.5424433871512883, "grad_norm": 1.4658159017562866, "learning_rate": 1.8803588505779617e-05, "loss": 0.834, "step": 1304 }, { "epoch": 0.542859371343889, "grad_norm": 1.4375289678573608, "learning_rate": 1.8801462574864543e-05, "loss": 0.9374, "step": 1305 }, { "epoch": 0.5432753555364896, "grad_norm": 1.5607041120529175, "learning_rate": 1.8799334877217086e-05, "loss": 0.9159, "step": 1306 }, { "epoch": 0.5436913397290903, "grad_norm": 1.3827277421951294, "learning_rate": 1.879720541326434e-05, "loss": 0.9615, "step": 1307 }, { "epoch": 0.5441073239216909, "grad_norm": 1.4051735401153564, "learning_rate": 1.8795074183433754e-05, "loss": 0.9386, "step": 1308 }, { "epoch": 0.5445233081142916, "grad_norm": 1.3671847581863403, "learning_rate": 1.8792941188153136e-05, "loss": 0.9928, "step": 1309 }, { "epoch": 0.5449392923068923, "grad_norm": 1.6321748495101929, "learning_rate": 1.8790806427850646e-05, "loss": 1.1086, "step": 1310 }, { "epoch": 0.545355276499493, "grad_norm": 388.0087890625, "learning_rate": 1.8788669902954798e-05, "loss": 0.8775, "step": 1311 }, { "epoch": 0.5457712606920937, "grad_norm": 1.2729294300079346, "learning_rate": 1.8786531613894462e-05, "loss": 0.9142, "step": 1312 }, { "epoch": 0.5461872448846944, "grad_norm": 1.3760881423950195, "learning_rate": 1.8784391561098854e-05, "loss": 0.9782, "step": 1313 }, { "epoch": 0.5466032290772951, "grad_norm": 1.4082268476486206, "learning_rate": 1.8782249744997554e-05, "loss": 1.0404, "step": 1314 }, { "epoch": 0.5470192132698958, "grad_norm": 1.45498526096344, "learning_rate": 1.8780106166020497e-05, "loss": 0.8899, "step": 1315 }, { "epoch": 0.5474351974624965, "grad_norm": 1.364083170890808, "learning_rate": 1.877796082459796e-05, "loss": 0.9362, "step": 1316 }, { "epoch": 0.547851181655097, "grad_norm": 1.4201263189315796, "learning_rate": 1.877581372116058e-05, "loss": 0.9168, "step": 1317 }, { "epoch": 0.5482671658476977, "grad_norm": 1.6078886985778809, "learning_rate": 1.8773664856139362e-05, "loss": 0.9817, "step": 1318 }, { "epoch": 0.5486831500402984, "grad_norm": 1.4103559255599976, "learning_rate": 1.8771514229965637e-05, "loss": 0.9687, "step": 1319 }, { "epoch": 0.5490991342328991, "grad_norm": 1.3296056985855103, "learning_rate": 1.8769361843071107e-05, "loss": 0.8848, "step": 1320 }, { "epoch": 0.5495151184254998, "grad_norm": 1.362423300743103, "learning_rate": 1.8767207695887827e-05, "loss": 0.9319, "step": 1321 }, { "epoch": 0.5499311026181005, "grad_norm": 1.4283945560455322, "learning_rate": 1.8765051788848205e-05, "loss": 0.8601, "step": 1322 }, { "epoch": 0.5503470868107012, "grad_norm": 1.4226040840148926, "learning_rate": 1.8762894122384992e-05, "loss": 0.9515, "step": 1323 }, { "epoch": 0.5507630710033019, "grad_norm": 1.3877066373825073, "learning_rate": 1.8760734696931307e-05, "loss": 0.9832, "step": 1324 }, { "epoch": 0.5511790551959026, "grad_norm": 1.4953759908676147, "learning_rate": 1.8758573512920608e-05, "loss": 1.0044, "step": 1325 }, { "epoch": 0.5515950393885032, "grad_norm": 1.600140929222107, "learning_rate": 1.875641057078672e-05, "loss": 0.9512, "step": 1326 }, { "epoch": 0.5520110235811039, "grad_norm": 1.335739254951477, "learning_rate": 1.8754245870963814e-05, "loss": 0.7981, "step": 1327 }, { "epoch": 0.5524270077737046, "grad_norm": 1.460955023765564, "learning_rate": 1.875207941388641e-05, "loss": 1.0485, "step": 1328 }, { "epoch": 0.5528429919663053, "grad_norm": 1.2958874702453613, "learning_rate": 1.8749911199989386e-05, "loss": 0.8712, "step": 1329 }, { "epoch": 0.553258976158906, "grad_norm": 1.3860334157943726, "learning_rate": 1.8747741229707972e-05, "loss": 0.9106, "step": 1330 }, { "epoch": 0.5536749603515067, "grad_norm": 1.5706672668457031, "learning_rate": 1.874556950347775e-05, "loss": 0.9247, "step": 1331 }, { "epoch": 0.5540909445441073, "grad_norm": 1.4388095140457153, "learning_rate": 1.874339602173465e-05, "loss": 0.9154, "step": 1332 }, { "epoch": 0.554506928736708, "grad_norm": 1.342272400856018, "learning_rate": 1.8741220784914964e-05, "loss": 0.808, "step": 1333 }, { "epoch": 0.5549229129293087, "grad_norm": 1.355452299118042, "learning_rate": 1.8739043793455327e-05, "loss": 0.9669, "step": 1334 }, { "epoch": 0.5553388971219093, "grad_norm": 1.3914915323257446, "learning_rate": 1.8736865047792734e-05, "loss": 0.9929, "step": 1335 }, { "epoch": 0.55575488131451, "grad_norm": 1.4073097705841064, "learning_rate": 1.8734684548364527e-05, "loss": 0.9681, "step": 1336 }, { "epoch": 0.5561708655071107, "grad_norm": 1.4216119050979614, "learning_rate": 1.87325022956084e-05, "loss": 0.882, "step": 1337 }, { "epoch": 0.5565868496997114, "grad_norm": 1.4394294023513794, "learning_rate": 1.87303182899624e-05, "loss": 0.9034, "step": 1338 }, { "epoch": 0.5570028338923121, "grad_norm": 1.61885404586792, "learning_rate": 1.8728132531864926e-05, "loss": 1.009, "step": 1339 }, { "epoch": 0.5574188180849128, "grad_norm": 1.3883622884750366, "learning_rate": 1.8725945021754733e-05, "loss": 0.8779, "step": 1340 }, { "epoch": 0.5578348022775135, "grad_norm": 1.3473931550979614, "learning_rate": 1.8723755760070916e-05, "loss": 0.671, "step": 1341 }, { "epoch": 0.5582507864701142, "grad_norm": 1.3804028034210205, "learning_rate": 1.8721564747252936e-05, "loss": 1.072, "step": 1342 }, { "epoch": 0.5586667706627149, "grad_norm": 1.421329379081726, "learning_rate": 1.87193719837406e-05, "loss": 0.9225, "step": 1343 }, { "epoch": 0.5590827548553154, "grad_norm": 1.3435083627700806, "learning_rate": 1.8717177469974054e-05, "loss": 0.9762, "step": 1344 }, { "epoch": 0.5594987390479161, "grad_norm": 1.3823734521865845, "learning_rate": 1.8714981206393822e-05, "loss": 0.9533, "step": 1345 }, { "epoch": 0.5599147232405168, "grad_norm": 1.3804699182510376, "learning_rate": 1.871278319344075e-05, "loss": 1.0757, "step": 1346 }, { "epoch": 0.5603307074331175, "grad_norm": 1.4381251335144043, "learning_rate": 1.8710583431556057e-05, "loss": 1.0119, "step": 1347 }, { "epoch": 0.5607466916257182, "grad_norm": 1.3237266540527344, "learning_rate": 1.87083819211813e-05, "loss": 1.0381, "step": 1348 }, { "epoch": 0.5611626758183189, "grad_norm": 1.3797398805618286, "learning_rate": 1.8706178662758396e-05, "loss": 0.9902, "step": 1349 }, { "epoch": 0.5615786600109196, "grad_norm": 1.4716904163360596, "learning_rate": 1.8703973656729606e-05, "loss": 1.0103, "step": 1350 }, { "epoch": 0.5619946442035203, "grad_norm": 2.0739200115203857, "learning_rate": 1.8701766903537548e-05, "loss": 0.9084, "step": 1351 }, { "epoch": 0.562410628396121, "grad_norm": 1.3811863660812378, "learning_rate": 1.8699558403625184e-05, "loss": 1.053, "step": 1352 }, { "epoch": 0.5628266125887216, "grad_norm": 1.4081412553787231, "learning_rate": 1.8697348157435828e-05, "loss": 0.8645, "step": 1353 }, { "epoch": 0.5632425967813223, "grad_norm": 1.5010985136032104, "learning_rate": 1.869513616541315e-05, "loss": 0.9165, "step": 1354 }, { "epoch": 0.563658580973923, "grad_norm": 1.4287663698196411, "learning_rate": 1.8692922428001166e-05, "loss": 0.9507, "step": 1355 }, { "epoch": 0.5640745651665237, "grad_norm": 1.4240484237670898, "learning_rate": 1.8690706945644245e-05, "loss": 0.9404, "step": 1356 }, { "epoch": 0.5644905493591243, "grad_norm": 1.378023624420166, "learning_rate": 1.8688489718787103e-05, "loss": 0.8237, "step": 1357 }, { "epoch": 0.564906533551725, "grad_norm": 47.65007781982422, "learning_rate": 1.8686270747874807e-05, "loss": 0.9918, "step": 1358 }, { "epoch": 0.5653225177443257, "grad_norm": 1.5806388854980469, "learning_rate": 1.8684050033352776e-05, "loss": 0.9341, "step": 1359 }, { "epoch": 0.5657385019369264, "grad_norm": 1.4747915267944336, "learning_rate": 1.8681827575666774e-05, "loss": 0.9528, "step": 1360 }, { "epoch": 0.5661544861295271, "grad_norm": 1.4905486106872559, "learning_rate": 1.8679603375262926e-05, "loss": 0.8299, "step": 1361 }, { "epoch": 0.5665704703221277, "grad_norm": 1.3668603897094727, "learning_rate": 1.8677377432587694e-05, "loss": 0.7871, "step": 1362 }, { "epoch": 0.5669864545147284, "grad_norm": 1.2786792516708374, "learning_rate": 1.8675149748087892e-05, "loss": 0.8083, "step": 1363 }, { "epoch": 0.5674024387073291, "grad_norm": 1.5769413709640503, "learning_rate": 1.8672920322210696e-05, "loss": 1.0476, "step": 1364 }, { "epoch": 0.5678184228999298, "grad_norm": 1.4331473112106323, "learning_rate": 1.8670689155403618e-05, "loss": 0.9099, "step": 1365 }, { "epoch": 0.5682344070925305, "grad_norm": 1.5399389266967773, "learning_rate": 1.866845624811452e-05, "loss": 1.0242, "step": 1366 }, { "epoch": 0.5686503912851312, "grad_norm": 1.494019865989685, "learning_rate": 1.866622160079162e-05, "loss": 0.9828, "step": 1367 }, { "epoch": 0.5690663754777319, "grad_norm": 1.9607455730438232, "learning_rate": 1.8663985213883485e-05, "loss": 0.9218, "step": 1368 }, { "epoch": 0.5694823596703326, "grad_norm": 52.00717544555664, "learning_rate": 1.8661747087839027e-05, "loss": 0.9727, "step": 1369 }, { "epoch": 0.5698983438629333, "grad_norm": 1.4321558475494385, "learning_rate": 1.865950722310751e-05, "loss": 1.0323, "step": 1370 }, { "epoch": 0.5703143280555338, "grad_norm": 1.3761377334594727, "learning_rate": 1.8657265620138543e-05, "loss": 0.8034, "step": 1371 }, { "epoch": 0.5707303122481345, "grad_norm": 1.4099663496017456, "learning_rate": 1.865502227938209e-05, "loss": 0.8778, "step": 1372 }, { "epoch": 0.5711462964407352, "grad_norm": 1.3845477104187012, "learning_rate": 1.8652777201288458e-05, "loss": 0.9019, "step": 1373 }, { "epoch": 0.5715622806333359, "grad_norm": 1.439592719078064, "learning_rate": 1.8650530386308308e-05, "loss": 1.0119, "step": 1374 }, { "epoch": 0.5719782648259366, "grad_norm": 1.5680198669433594, "learning_rate": 1.864828183489264e-05, "loss": 0.9437, "step": 1375 }, { "epoch": 0.5723942490185373, "grad_norm": 1.450745701789856, "learning_rate": 1.864603154749282e-05, "loss": 0.8561, "step": 1376 }, { "epoch": 0.572810233211138, "grad_norm": 1.3203610181808472, "learning_rate": 1.8643779524560543e-05, "loss": 0.9928, "step": 1377 }, { "epoch": 0.5732262174037387, "grad_norm": 1.4203170537948608, "learning_rate": 1.8641525766547866e-05, "loss": 0.9267, "step": 1378 }, { "epoch": 0.5736422015963394, "grad_norm": 4.577019214630127, "learning_rate": 1.863927027390719e-05, "loss": 0.9874, "step": 1379 }, { "epoch": 0.57405818578894, "grad_norm": 1.4339542388916016, "learning_rate": 1.8637013047091258e-05, "loss": 0.971, "step": 1380 }, { "epoch": 0.5744741699815407, "grad_norm": 1.3186275959014893, "learning_rate": 1.8634754086553178e-05, "loss": 0.9531, "step": 1381 }, { "epoch": 0.5748901541741414, "grad_norm": 1.302496075630188, "learning_rate": 1.8632493392746385e-05, "loss": 0.9012, "step": 1382 }, { "epoch": 0.575306138366742, "grad_norm": 1.4284160137176514, "learning_rate": 1.8630230966124674e-05, "loss": 0.9394, "step": 1383 }, { "epoch": 0.5757221225593427, "grad_norm": 1.3648531436920166, "learning_rate": 1.8627966807142187e-05, "loss": 0.8814, "step": 1384 }, { "epoch": 0.5761381067519434, "grad_norm": 1.4299086332321167, "learning_rate": 1.8625700916253415e-05, "loss": 0.9156, "step": 1385 }, { "epoch": 0.5765540909445441, "grad_norm": 1.3967705965042114, "learning_rate": 1.8623433293913188e-05, "loss": 0.9174, "step": 1386 }, { "epoch": 0.5769700751371448, "grad_norm": 1.3559279441833496, "learning_rate": 1.8621163940576697e-05, "loss": 0.8349, "step": 1387 }, { "epoch": 0.5773860593297455, "grad_norm": 1.4870116710662842, "learning_rate": 1.8618892856699467e-05, "loss": 0.9136, "step": 1388 }, { "epoch": 0.5778020435223461, "grad_norm": 1.3566172122955322, "learning_rate": 1.8616620042737378e-05, "loss": 0.8225, "step": 1389 }, { "epoch": 0.5782180277149468, "grad_norm": 1.298852801322937, "learning_rate": 1.8614345499146655e-05, "loss": 0.789, "step": 1390 }, { "epoch": 0.5786340119075475, "grad_norm": 1.3384932279586792, "learning_rate": 1.8612069226383876e-05, "loss": 0.8975, "step": 1391 }, { "epoch": 0.5790499961001482, "grad_norm": 1.4446903467178345, "learning_rate": 1.8609791224905955e-05, "loss": 0.9282, "step": 1392 }, { "epoch": 0.5794659802927489, "grad_norm": 1.5014679431915283, "learning_rate": 1.8607511495170163e-05, "loss": 0.8598, "step": 1393 }, { "epoch": 0.5798819644853496, "grad_norm": 1.4465806484222412, "learning_rate": 1.8605230037634112e-05, "loss": 0.9992, "step": 1394 }, { "epoch": 0.5802979486779503, "grad_norm": 1.454960823059082, "learning_rate": 1.860294685275576e-05, "loss": 1.0263, "step": 1395 }, { "epoch": 0.580713932870551, "grad_norm": 1.3277530670166016, "learning_rate": 1.8600661940993424e-05, "loss": 0.8334, "step": 1396 }, { "epoch": 0.5811299170631516, "grad_norm": 24.076637268066406, "learning_rate": 1.859837530280575e-05, "loss": 0.9378, "step": 1397 }, { "epoch": 0.5815459012557522, "grad_norm": 1.4009581804275513, "learning_rate": 1.8596086938651736e-05, "loss": 0.9201, "step": 1398 }, { "epoch": 0.5819618854483529, "grad_norm": 1.3398404121398926, "learning_rate": 1.8593796848990736e-05, "loss": 1.0166, "step": 1399 }, { "epoch": 0.5823778696409536, "grad_norm": 1.5045256614685059, "learning_rate": 1.8591505034282446e-05, "loss": 1.0208, "step": 1400 }, { "epoch": 0.5827938538335543, "grad_norm": 1.561253309249878, "learning_rate": 1.8589211494986896e-05, "loss": 0.9235, "step": 1401 }, { "epoch": 0.583209838026155, "grad_norm": 1.4654806852340698, "learning_rate": 1.8586916231564483e-05, "loss": 0.9386, "step": 1402 }, { "epoch": 0.5836258222187557, "grad_norm": 1.440285325050354, "learning_rate": 1.8584619244475928e-05, "loss": 0.98, "step": 1403 }, { "epoch": 0.5840418064113564, "grad_norm": 6.423582077026367, "learning_rate": 1.8582320534182315e-05, "loss": 0.951, "step": 1404 }, { "epoch": 0.5844577906039571, "grad_norm": 1.3785481452941895, "learning_rate": 1.858002010114507e-05, "loss": 0.9432, "step": 1405 }, { "epoch": 0.5848737747965578, "grad_norm": 1.4277480840682983, "learning_rate": 1.857771794582596e-05, "loss": 0.976, "step": 1406 }, { "epoch": 0.5852897589891584, "grad_norm": 1.4100137948989868, "learning_rate": 1.8575414068687098e-05, "loss": 0.8985, "step": 1407 }, { "epoch": 0.585705743181759, "grad_norm": 1.4704440832138062, "learning_rate": 1.857310847019095e-05, "loss": 0.9246, "step": 1408 }, { "epoch": 0.5861217273743597, "grad_norm": 1.439522385597229, "learning_rate": 1.8570801150800323e-05, "loss": 0.9605, "step": 1409 }, { "epoch": 0.5865377115669604, "grad_norm": 1.3754841089248657, "learning_rate": 1.8568492110978363e-05, "loss": 0.9064, "step": 1410 }, { "epoch": 0.5869536957595611, "grad_norm": 1.4575386047363281, "learning_rate": 1.856618135118857e-05, "loss": 0.9412, "step": 1411 }, { "epoch": 0.5873696799521618, "grad_norm": 1.4203076362609863, "learning_rate": 1.856386887189479e-05, "loss": 0.8629, "step": 1412 }, { "epoch": 0.5877856641447625, "grad_norm": 55.694847106933594, "learning_rate": 1.856155467356121e-05, "loss": 0.9509, "step": 1413 }, { "epoch": 0.5882016483373632, "grad_norm": 1.2368370294570923, "learning_rate": 1.8559238756652357e-05, "loss": 0.8275, "step": 1414 }, { "epoch": 0.5886176325299639, "grad_norm": 1.1894137859344482, "learning_rate": 1.8556921121633117e-05, "loss": 0.8254, "step": 1415 }, { "epoch": 0.5890336167225645, "grad_norm": 1.2863622903823853, "learning_rate": 1.855460176896871e-05, "loss": 0.8457, "step": 1416 }, { "epoch": 0.5894496009151652, "grad_norm": 1.4421943426132202, "learning_rate": 1.85522806991247e-05, "loss": 0.9194, "step": 1417 }, { "epoch": 0.5898655851077659, "grad_norm": 1.450951337814331, "learning_rate": 1.8549957912567005e-05, "loss": 0.8891, "step": 1418 }, { "epoch": 0.5902815693003666, "grad_norm": 97.08625030517578, "learning_rate": 1.8547633409761877e-05, "loss": 0.8795, "step": 1419 }, { "epoch": 0.5906975534929673, "grad_norm": 1.3588882684707642, "learning_rate": 1.8545307191175918e-05, "loss": 0.8331, "step": 1420 }, { "epoch": 0.591113537685568, "grad_norm": 1.4596487283706665, "learning_rate": 1.854297925727608e-05, "loss": 0.9359, "step": 1421 }, { "epoch": 0.5915295218781687, "grad_norm": 1.5546504259109497, "learning_rate": 1.8540649608529642e-05, "loss": 1.0619, "step": 1422 }, { "epoch": 0.5919455060707693, "grad_norm": 1.4651780128479004, "learning_rate": 1.853831824540425e-05, "loss": 0.9074, "step": 1423 }, { "epoch": 0.59236149026337, "grad_norm": 1.4058136940002441, "learning_rate": 1.8535985168367875e-05, "loss": 0.821, "step": 1424 }, { "epoch": 0.5927774744559706, "grad_norm": 1.4751912355422974, "learning_rate": 1.853365037788884e-05, "loss": 0.9749, "step": 1425 }, { "epoch": 0.5931934586485713, "grad_norm": 1.4112811088562012, "learning_rate": 1.8531313874435814e-05, "loss": 0.8996, "step": 1426 }, { "epoch": 0.593609442841172, "grad_norm": 1.317168116569519, "learning_rate": 1.8528975658477802e-05, "loss": 0.9402, "step": 1427 }, { "epoch": 0.5940254270337727, "grad_norm": 1.384986162185669, "learning_rate": 1.8526635730484167e-05, "loss": 0.9743, "step": 1428 }, { "epoch": 0.5944414112263734, "grad_norm": 1.361762523651123, "learning_rate": 1.85242940909246e-05, "loss": 0.8247, "step": 1429 }, { "epoch": 0.5948573954189741, "grad_norm": 1.4150714874267578, "learning_rate": 1.8521950740269147e-05, "loss": 0.9753, "step": 1430 }, { "epoch": 0.5952733796115748, "grad_norm": 1.44558584690094, "learning_rate": 1.8519605678988185e-05, "loss": 0.8574, "step": 1431 }, { "epoch": 0.5956893638041755, "grad_norm": 1.43723464012146, "learning_rate": 1.8517258907552447e-05, "loss": 0.9569, "step": 1432 }, { "epoch": 0.5961053479967762, "grad_norm": 1.4967139959335327, "learning_rate": 1.8514910426433007e-05, "loss": 0.937, "step": 1433 }, { "epoch": 0.5965213321893768, "grad_norm": 1.4794517755508423, "learning_rate": 1.8512560236101276e-05, "loss": 0.9706, "step": 1434 }, { "epoch": 0.5969373163819774, "grad_norm": 1.3915947675704956, "learning_rate": 1.8510208337029012e-05, "loss": 0.9007, "step": 1435 }, { "epoch": 0.5973533005745781, "grad_norm": 1.3954490423202515, "learning_rate": 1.8507854729688313e-05, "loss": 1.0132, "step": 1436 }, { "epoch": 0.5977692847671788, "grad_norm": 1.4940071105957031, "learning_rate": 1.850549941455163e-05, "loss": 0.8914, "step": 1437 }, { "epoch": 0.5981852689597795, "grad_norm": 1.239787220954895, "learning_rate": 1.8503142392091745e-05, "loss": 0.796, "step": 1438 }, { "epoch": 0.5986012531523802, "grad_norm": 1.3793061971664429, "learning_rate": 1.8500783662781785e-05, "loss": 0.8409, "step": 1439 }, { "epoch": 0.5990172373449809, "grad_norm": 1.3941645622253418, "learning_rate": 1.8498423227095224e-05, "loss": 0.9878, "step": 1440 }, { "epoch": 0.5994332215375816, "grad_norm": 1.542241096496582, "learning_rate": 1.849606108550588e-05, "loss": 0.8141, "step": 1441 }, { "epoch": 0.5998492057301823, "grad_norm": 1.4677157402038574, "learning_rate": 1.8493697238487897e-05, "loss": 0.873, "step": 1442 }, { "epoch": 0.6002651899227829, "grad_norm": 1.5060995817184448, "learning_rate": 1.849133168651579e-05, "loss": 0.8139, "step": 1443 }, { "epoch": 0.6006811741153836, "grad_norm": 1.6413167715072632, "learning_rate": 1.848896443006439e-05, "loss": 0.9777, "step": 1444 }, { "epoch": 0.6010971583079843, "grad_norm": 1.362534761428833, "learning_rate": 1.8486595469608883e-05, "loss": 0.9742, "step": 1445 }, { "epoch": 0.601513142500585, "grad_norm": 1.4069232940673828, "learning_rate": 1.8484224805624797e-05, "loss": 0.8925, "step": 1446 }, { "epoch": 0.6019291266931857, "grad_norm": 1.4706356525421143, "learning_rate": 1.8481852438587994e-05, "loss": 0.9759, "step": 1447 }, { "epoch": 0.6023451108857864, "grad_norm": 1.7047970294952393, "learning_rate": 1.847947836897469e-05, "loss": 0.9047, "step": 1448 }, { "epoch": 0.602761095078387, "grad_norm": 1.3043224811553955, "learning_rate": 1.847710259726143e-05, "loss": 0.8904, "step": 1449 }, { "epoch": 0.6031770792709877, "grad_norm": 1.4194327592849731, "learning_rate": 1.8474725123925107e-05, "loss": 0.93, "step": 1450 }, { "epoch": 0.6035930634635884, "grad_norm": 1.4959709644317627, "learning_rate": 1.847234594944296e-05, "loss": 0.9251, "step": 1451 }, { "epoch": 0.604009047656189, "grad_norm": 1.5254113674163818, "learning_rate": 1.846996507429256e-05, "loss": 0.9452, "step": 1452 }, { "epoch": 0.6044250318487897, "grad_norm": 1.47559654712677, "learning_rate": 1.846758249895183e-05, "loss": 0.9289, "step": 1453 }, { "epoch": 0.6048410160413904, "grad_norm": 1.3878194093704224, "learning_rate": 1.846519822389902e-05, "loss": 0.8358, "step": 1454 }, { "epoch": 0.6052570002339911, "grad_norm": 78.63619232177734, "learning_rate": 1.846281224961273e-05, "loss": 0.8869, "step": 1455 }, { "epoch": 0.6056729844265918, "grad_norm": 1.4088162183761597, "learning_rate": 1.846042457657191e-05, "loss": 0.9588, "step": 1456 }, { "epoch": 0.6060889686191925, "grad_norm": 1.5063369274139404, "learning_rate": 1.8458035205255837e-05, "loss": 0.9145, "step": 1457 }, { "epoch": 0.6065049528117932, "grad_norm": 1.3973530530929565, "learning_rate": 1.845564413614413e-05, "loss": 0.9237, "step": 1458 }, { "epoch": 0.6069209370043939, "grad_norm": 1.4442157745361328, "learning_rate": 1.8453251369716757e-05, "loss": 0.9009, "step": 1459 }, { "epoch": 0.6073369211969946, "grad_norm": 1.3979978561401367, "learning_rate": 1.845085690645402e-05, "loss": 0.8371, "step": 1460 }, { "epoch": 0.6077529053895951, "grad_norm": 1.5177502632141113, "learning_rate": 1.8448460746836558e-05, "loss": 0.9167, "step": 1461 }, { "epoch": 0.6081688895821958, "grad_norm": 1.526997447013855, "learning_rate": 1.8446062891345366e-05, "loss": 0.9767, "step": 1462 }, { "epoch": 0.6085848737747965, "grad_norm": 1.5276706218719482, "learning_rate": 1.8443663340461766e-05, "loss": 0.9301, "step": 1463 }, { "epoch": 0.6090008579673972, "grad_norm": 1.5068203210830688, "learning_rate": 1.844126209466742e-05, "loss": 0.889, "step": 1464 }, { "epoch": 0.6094168421599979, "grad_norm": 1.5722923278808594, "learning_rate": 1.843885915444434e-05, "loss": 0.9098, "step": 1465 }, { "epoch": 0.6098328263525986, "grad_norm": 1.5716183185577393, "learning_rate": 1.843645452027487e-05, "loss": 0.9055, "step": 1466 }, { "epoch": 0.6102488105451993, "grad_norm": 1.5935124158859253, "learning_rate": 1.8434048192641694e-05, "loss": 0.9635, "step": 1467 }, { "epoch": 0.6106647947378, "grad_norm": 1.4653584957122803, "learning_rate": 1.843164017202784e-05, "loss": 0.9928, "step": 1468 }, { "epoch": 0.6110807789304007, "grad_norm": 1.3970632553100586, "learning_rate": 1.8429230458916672e-05, "loss": 0.877, "step": 1469 }, { "epoch": 0.6114967631230013, "grad_norm": 1.5629993677139282, "learning_rate": 1.84268190537919e-05, "loss": 0.9178, "step": 1470 }, { "epoch": 0.611912747315602, "grad_norm": 1.391983985900879, "learning_rate": 1.842440595713757e-05, "loss": 0.9984, "step": 1471 }, { "epoch": 0.6123287315082027, "grad_norm": 1.3987846374511719, "learning_rate": 1.842199116943806e-05, "loss": 0.8484, "step": 1472 }, { "epoch": 0.6127447157008034, "grad_norm": 1.5616625547409058, "learning_rate": 1.84195746911781e-05, "loss": 0.9863, "step": 1473 }, { "epoch": 0.613160699893404, "grad_norm": 15.835955619812012, "learning_rate": 1.8417156522842756e-05, "loss": 0.8619, "step": 1474 }, { "epoch": 0.6135766840860047, "grad_norm": 1.3932381868362427, "learning_rate": 1.841473666491742e-05, "loss": 0.8847, "step": 1475 }, { "epoch": 0.6139926682786054, "grad_norm": 1.5450211763381958, "learning_rate": 1.8412315117887844e-05, "loss": 0.9497, "step": 1476 }, { "epoch": 0.6144086524712061, "grad_norm": 1.275922179222107, "learning_rate": 1.840989188224011e-05, "loss": 0.9727, "step": 1477 }, { "epoch": 0.6148246366638068, "grad_norm": 1.41254723072052, "learning_rate": 1.840746695846063e-05, "loss": 0.7471, "step": 1478 }, { "epoch": 0.6152406208564074, "grad_norm": 1.5389318466186523, "learning_rate": 1.8405040347036168e-05, "loss": 1.0207, "step": 1479 }, { "epoch": 0.6156566050490081, "grad_norm": 1.3298307657241821, "learning_rate": 1.8402612048453824e-05, "loss": 0.8493, "step": 1480 }, { "epoch": 0.6160725892416088, "grad_norm": 1.4440768957138062, "learning_rate": 1.840018206320103e-05, "loss": 0.8298, "step": 1481 }, { "epoch": 0.6164885734342095, "grad_norm": 1.3836541175842285, "learning_rate": 1.8397750391765564e-05, "loss": 0.8636, "step": 1482 }, { "epoch": 0.6169045576268102, "grad_norm": 484.65472412109375, "learning_rate": 1.839531703463554e-05, "loss": 0.9167, "step": 1483 }, { "epoch": 0.6173205418194109, "grad_norm": 1.4468276500701904, "learning_rate": 1.839288199229941e-05, "loss": 0.9158, "step": 1484 }, { "epoch": 0.6177365260120116, "grad_norm": 1.5263737440109253, "learning_rate": 1.8390445265245958e-05, "loss": 0.8742, "step": 1485 }, { "epoch": 0.6181525102046123, "grad_norm": 1.576399803161621, "learning_rate": 1.8388006853964317e-05, "loss": 0.8168, "step": 1486 }, { "epoch": 0.618568494397213, "grad_norm": 1.619613528251648, "learning_rate": 1.8385566758943956e-05, "loss": 0.9017, "step": 1487 }, { "epoch": 0.6189844785898135, "grad_norm": 1.5125285387039185, "learning_rate": 1.8383124980674674e-05, "loss": 1.011, "step": 1488 }, { "epoch": 0.6194004627824142, "grad_norm": 1.5042436122894287, "learning_rate": 1.838068151964662e-05, "loss": 0.9347, "step": 1489 }, { "epoch": 0.6198164469750149, "grad_norm": 1.4522223472595215, "learning_rate": 1.8378236376350265e-05, "loss": 0.9881, "step": 1490 }, { "epoch": 0.6202324311676156, "grad_norm": 1.4301121234893799, "learning_rate": 1.8375789551276436e-05, "loss": 0.9357, "step": 1491 }, { "epoch": 0.6206484153602163, "grad_norm": 1.6133427619934082, "learning_rate": 1.837334104491628e-05, "loss": 0.9285, "step": 1492 }, { "epoch": 0.621064399552817, "grad_norm": 1.4000422954559326, "learning_rate": 1.8370890857761297e-05, "loss": 0.7982, "step": 1493 }, { "epoch": 0.6214803837454177, "grad_norm": 1.5088982582092285, "learning_rate": 1.836843899030331e-05, "loss": 1.0013, "step": 1494 }, { "epoch": 0.6218963679380184, "grad_norm": 1.3417843580245972, "learning_rate": 1.8365985443034497e-05, "loss": 0.8993, "step": 1495 }, { "epoch": 0.6223123521306191, "grad_norm": 1.5062752962112427, "learning_rate": 1.836353021644735e-05, "loss": 0.9729, "step": 1496 }, { "epoch": 0.6227283363232197, "grad_norm": 1.478127121925354, "learning_rate": 1.8361073311034722e-05, "loss": 0.9395, "step": 1497 }, { "epoch": 0.6231443205158204, "grad_norm": 1.9786403179168701, "learning_rate": 1.8358614727289782e-05, "loss": 0.9315, "step": 1498 }, { "epoch": 0.623560304708421, "grad_norm": 1.485824704170227, "learning_rate": 1.8356154465706053e-05, "loss": 0.8511, "step": 1499 }, { "epoch": 0.6239762889010217, "grad_norm": 1.4129137992858887, "learning_rate": 1.8353692526777384e-05, "loss": 0.9224, "step": 1500 }, { "epoch": 0.6239762889010217, "eval_loss": 0.8369745016098022, "eval_runtime": 1986.9646, "eval_samples_per_second": 3.317, "eval_steps_per_second": 1.659, "step": 1500 }, { "epoch": 0.6243922730936224, "grad_norm": 1.3461018800735474, "learning_rate": 1.8351228910997963e-05, "loss": 0.8711, "step": 1501 }, { "epoch": 0.6248082572862231, "grad_norm": 1.343208909034729, "learning_rate": 1.834876361886232e-05, "loss": 0.7762, "step": 1502 }, { "epoch": 0.6252242414788238, "grad_norm": 1.5384917259216309, "learning_rate": 1.8346296650865314e-05, "loss": 0.8756, "step": 1503 }, { "epoch": 0.6256402256714245, "grad_norm": 11.248188018798828, "learning_rate": 1.8343828007502146e-05, "loss": 0.9349, "step": 1504 }, { "epoch": 0.6260562098640252, "grad_norm": 116.63265991210938, "learning_rate": 1.8341357689268347e-05, "loss": 0.904, "step": 1505 }, { "epoch": 0.6264721940566258, "grad_norm": 1.4618874788284302, "learning_rate": 1.8338885696659795e-05, "loss": 0.8555, "step": 1506 }, { "epoch": 0.6268881782492265, "grad_norm": 51.47650909423828, "learning_rate": 1.8336412030172688e-05, "loss": 1.0674, "step": 1507 }, { "epoch": 0.6273041624418272, "grad_norm": 1.480875849723816, "learning_rate": 1.8333936690303574e-05, "loss": 0.8011, "step": 1508 }, { "epoch": 0.6277201466344279, "grad_norm": 1.5745702981948853, "learning_rate": 1.8331459677549335e-05, "loss": 0.9279, "step": 1509 }, { "epoch": 0.6281361308270286, "grad_norm": 71.94361877441406, "learning_rate": 1.832898099240718e-05, "loss": 0.9926, "step": 1510 }, { "epoch": 0.6285521150196293, "grad_norm": 1.3998315334320068, "learning_rate": 1.8326500635374665e-05, "loss": 0.875, "step": 1511 }, { "epoch": 0.62896809921223, "grad_norm": 99.06598663330078, "learning_rate": 1.8324018606949673e-05, "loss": 0.9597, "step": 1512 }, { "epoch": 0.6293840834048307, "grad_norm": 1.3854135274887085, "learning_rate": 1.832153490763043e-05, "loss": 0.799, "step": 1513 }, { "epoch": 0.6298000675974313, "grad_norm": 1.4779832363128662, "learning_rate": 1.831904953791549e-05, "loss": 0.857, "step": 1514 }, { "epoch": 0.6302160517900319, "grad_norm": 1.513683557510376, "learning_rate": 1.831656249830374e-05, "loss": 0.9666, "step": 1515 }, { "epoch": 0.6306320359826326, "grad_norm": 1.4512884616851807, "learning_rate": 1.8314073789294412e-05, "loss": 0.9171, "step": 1516 }, { "epoch": 0.6310480201752333, "grad_norm": 1.4365081787109375, "learning_rate": 1.8311583411387076e-05, "loss": 0.8916, "step": 1517 }, { "epoch": 0.631464004367834, "grad_norm": 1.4607292413711548, "learning_rate": 1.8309091365081618e-05, "loss": 0.9258, "step": 1518 }, { "epoch": 0.6318799885604347, "grad_norm": 1.3311644792556763, "learning_rate": 1.830659765087828e-05, "loss": 0.8557, "step": 1519 }, { "epoch": 0.6322959727530354, "grad_norm": 1.5570963621139526, "learning_rate": 1.8304102269277627e-05, "loss": 0.9445, "step": 1520 }, { "epoch": 0.6327119569456361, "grad_norm": 1.450318455696106, "learning_rate": 1.8301605220780555e-05, "loss": 0.8883, "step": 1521 }, { "epoch": 0.6331279411382368, "grad_norm": 1.3185594081878662, "learning_rate": 1.8299106505888306e-05, "loss": 0.9155, "step": 1522 }, { "epoch": 0.6335439253308375, "grad_norm": 1.5183050632476807, "learning_rate": 1.8296606125102453e-05, "loss": 0.874, "step": 1523 }, { "epoch": 0.6339599095234381, "grad_norm": 1.4109902381896973, "learning_rate": 1.8294104078924896e-05, "loss": 0.8721, "step": 1524 }, { "epoch": 0.6343758937160388, "grad_norm": 1.4860293865203857, "learning_rate": 1.8291600367857884e-05, "loss": 0.9427, "step": 1525 }, { "epoch": 0.6347918779086394, "grad_norm": 4.205594539642334, "learning_rate": 1.828909499240398e-05, "loss": 0.9124, "step": 1526 }, { "epoch": 0.6352078621012401, "grad_norm": 1.947873592376709, "learning_rate": 1.82865879530661e-05, "loss": 0.9213, "step": 1527 }, { "epoch": 0.6356238462938408, "grad_norm": 1.3853743076324463, "learning_rate": 1.828407925034748e-05, "loss": 1.0012, "step": 1528 }, { "epoch": 0.6360398304864415, "grad_norm": 1.378125548362732, "learning_rate": 1.8281568884751704e-05, "loss": 0.9428, "step": 1529 }, { "epoch": 0.6364558146790422, "grad_norm": 1.385105848312378, "learning_rate": 1.8279056856782678e-05, "loss": 0.6975, "step": 1530 }, { "epoch": 0.6368717988716429, "grad_norm": 1.336798071861267, "learning_rate": 1.8276543166944642e-05, "loss": 0.8584, "step": 1531 }, { "epoch": 0.6372877830642436, "grad_norm": 1.3402284383773804, "learning_rate": 1.827402781574218e-05, "loss": 0.8308, "step": 1532 }, { "epoch": 0.6377037672568442, "grad_norm": 1.6821712255477905, "learning_rate": 1.82715108036802e-05, "loss": 0.9679, "step": 1533 }, { "epoch": 0.6381197514494449, "grad_norm": 1.6113358736038208, "learning_rate": 1.826899213126394e-05, "loss": 0.7587, "step": 1534 }, { "epoch": 0.6385357356420456, "grad_norm": 1.5167955160140991, "learning_rate": 1.826647179899899e-05, "loss": 0.8696, "step": 1535 }, { "epoch": 0.6389517198346463, "grad_norm": 1.4799162149429321, "learning_rate": 1.8263949807391247e-05, "loss": 0.887, "step": 1536 }, { "epoch": 0.639367704027247, "grad_norm": 1.4215210676193237, "learning_rate": 1.8261426156946967e-05, "loss": 0.8262, "step": 1537 }, { "epoch": 0.6397836882198477, "grad_norm": 1.427168846130371, "learning_rate": 1.8258900848172714e-05, "loss": 0.8825, "step": 1538 }, { "epoch": 0.6401996724124484, "grad_norm": 470.330810546875, "learning_rate": 1.8256373881575408e-05, "loss": 0.7456, "step": 1539 }, { "epoch": 0.640615656605049, "grad_norm": 1.2835979461669922, "learning_rate": 1.8253845257662287e-05, "loss": 0.8223, "step": 1540 }, { "epoch": 0.6410316407976497, "grad_norm": 63.74077224731445, "learning_rate": 1.825131497694092e-05, "loss": 0.8987, "step": 1541 }, { "epoch": 0.6414476249902503, "grad_norm": 1.4655559062957764, "learning_rate": 1.824878303991923e-05, "loss": 1.0189, "step": 1542 }, { "epoch": 0.641863609182851, "grad_norm": 32.7840576171875, "learning_rate": 1.8246249447105442e-05, "loss": 0.8609, "step": 1543 }, { "epoch": 0.6422795933754517, "grad_norm": 71.17794799804688, "learning_rate": 1.8243714199008136e-05, "loss": 1.0307, "step": 1544 }, { "epoch": 0.6426955775680524, "grad_norm": 1.3928886651992798, "learning_rate": 1.8241177296136215e-05, "loss": 0.8651, "step": 1545 }, { "epoch": 0.6431115617606531, "grad_norm": 1.4957371950149536, "learning_rate": 1.8238638738998915e-05, "loss": 0.8394, "step": 1546 }, { "epoch": 0.6435275459532538, "grad_norm": 1.5484641790390015, "learning_rate": 1.8236098528105802e-05, "loss": 0.8896, "step": 1547 }, { "epoch": 0.6439435301458545, "grad_norm": 1.4621442556381226, "learning_rate": 1.8233556663966783e-05, "loss": 0.9161, "step": 1548 }, { "epoch": 0.6443595143384552, "grad_norm": 1.547165036201477, "learning_rate": 1.823101314709209e-05, "loss": 1.0156, "step": 1549 }, { "epoch": 0.6447754985310559, "grad_norm": 1.456059217453003, "learning_rate": 1.822846797799228e-05, "loss": 0.8518, "step": 1550 }, { "epoch": 0.6451914827236565, "grad_norm": 1.357054591178894, "learning_rate": 1.8225921157178255e-05, "loss": 0.8197, "step": 1551 }, { "epoch": 0.6456074669162571, "grad_norm": 1.3951103687286377, "learning_rate": 1.8223372685161244e-05, "loss": 0.9016, "step": 1552 }, { "epoch": 0.6460234511088578, "grad_norm": 1.5694949626922607, "learning_rate": 1.82208225624528e-05, "loss": 0.9333, "step": 1553 }, { "epoch": 0.6464394353014585, "grad_norm": 1.4381108283996582, "learning_rate": 1.8218270789564824e-05, "loss": 0.8601, "step": 1554 }, { "epoch": 0.6468554194940592, "grad_norm": 2.5379323959350586, "learning_rate": 1.8215717367009526e-05, "loss": 0.9409, "step": 1555 }, { "epoch": 0.6472714036866599, "grad_norm": 1.6328805685043335, "learning_rate": 1.8213162295299466e-05, "loss": 0.9108, "step": 1556 }, { "epoch": 0.6476873878792606, "grad_norm": 356.79925537109375, "learning_rate": 1.8210605574947525e-05, "loss": 1.0705, "step": 1557 }, { "epoch": 0.6481033720718613, "grad_norm": 1.4453932046890259, "learning_rate": 1.8208047206466918e-05, "loss": 1.001, "step": 1558 }, { "epoch": 0.648519356264462, "grad_norm": 1.5773334503173828, "learning_rate": 1.820548719037119e-05, "loss": 0.939, "step": 1559 }, { "epoch": 0.6489353404570626, "grad_norm": 1.3810123205184937, "learning_rate": 1.820292552717422e-05, "loss": 0.9892, "step": 1560 }, { "epoch": 0.6493513246496633, "grad_norm": 1.4162930250167847, "learning_rate": 1.8200362217390212e-05, "loss": 0.8274, "step": 1561 }, { "epoch": 0.649767308842264, "grad_norm": 1.4349828958511353, "learning_rate": 1.8197797261533707e-05, "loss": 0.8574, "step": 1562 }, { "epoch": 0.6501832930348647, "grad_norm": 1.4438791275024414, "learning_rate": 1.8195230660119566e-05, "loss": 0.8896, "step": 1563 }, { "epoch": 0.6505992772274654, "grad_norm": 1.4300882816314697, "learning_rate": 1.8192662413663e-05, "loss": 0.8289, "step": 1564 }, { "epoch": 0.651015261420066, "grad_norm": 1.5201835632324219, "learning_rate": 1.8190092522679517e-05, "loss": 0.815, "step": 1565 }, { "epoch": 0.6514312456126667, "grad_norm": 1.4472349882125854, "learning_rate": 1.8187520987684997e-05, "loss": 0.8997, "step": 1566 }, { "epoch": 0.6518472298052674, "grad_norm": 1.411281704902649, "learning_rate": 1.8184947809195617e-05, "loss": 0.9247, "step": 1567 }, { "epoch": 0.6522632139978681, "grad_norm": 1.3326748609542847, "learning_rate": 1.8182372987727902e-05, "loss": 0.8022, "step": 1568 }, { "epoch": 0.6526791981904687, "grad_norm": 1.3876920938491821, "learning_rate": 1.817979652379869e-05, "loss": 0.8417, "step": 1569 }, { "epoch": 0.6530951823830694, "grad_norm": 1.4317984580993652, "learning_rate": 1.8177218417925167e-05, "loss": 0.8525, "step": 1570 }, { "epoch": 0.6535111665756701, "grad_norm": 1.4819421768188477, "learning_rate": 1.817463867062484e-05, "loss": 1.0867, "step": 1571 }, { "epoch": 0.6539271507682708, "grad_norm": 1.4282665252685547, "learning_rate": 1.8172057282415544e-05, "loss": 0.8311, "step": 1572 }, { "epoch": 0.6543431349608715, "grad_norm": 1.6222862005233765, "learning_rate": 1.8169474253815444e-05, "loss": 1.0051, "step": 1573 }, { "epoch": 0.6547591191534722, "grad_norm": 1.4603509902954102, "learning_rate": 1.816688958534304e-05, "loss": 0.9139, "step": 1574 }, { "epoch": 0.6551751033460729, "grad_norm": 1.3030455112457275, "learning_rate": 1.816430327751715e-05, "loss": 0.8512, "step": 1575 }, { "epoch": 0.6555910875386736, "grad_norm": 1.4316705465316772, "learning_rate": 1.8161715330856938e-05, "loss": 0.8975, "step": 1576 }, { "epoch": 0.6560070717312743, "grad_norm": 1.3505316972732544, "learning_rate": 1.8159125745881877e-05, "loss": 0.8222, "step": 1577 }, { "epoch": 0.6564230559238748, "grad_norm": 1.459453821182251, "learning_rate": 1.8156534523111788e-05, "loss": 0.922, "step": 1578 }, { "epoch": 0.6568390401164755, "grad_norm": 1.5265191793441772, "learning_rate": 1.8153941663066805e-05, "loss": 0.9337, "step": 1579 }, { "epoch": 0.6572550243090762, "grad_norm": 1.459133267402649, "learning_rate": 1.8151347166267396e-05, "loss": 0.8986, "step": 1580 }, { "epoch": 0.6576710085016769, "grad_norm": 1.5898860692977905, "learning_rate": 1.814875103323436e-05, "loss": 0.9665, "step": 1581 }, { "epoch": 0.6580869926942776, "grad_norm": 1.8575170040130615, "learning_rate": 1.814615326448883e-05, "loss": 0.9643, "step": 1582 }, { "epoch": 0.6585029768868783, "grad_norm": 1.5794239044189453, "learning_rate": 1.8143553860552252e-05, "loss": 0.8526, "step": 1583 }, { "epoch": 0.658918961079479, "grad_norm": 1.5328313112258911, "learning_rate": 1.8140952821946413e-05, "loss": 0.841, "step": 1584 }, { "epoch": 0.6593349452720797, "grad_norm": 1.5085119009017944, "learning_rate": 1.813835014919342e-05, "loss": 0.9229, "step": 1585 }, { "epoch": 0.6597509294646804, "grad_norm": 840.7476806640625, "learning_rate": 1.813574584281572e-05, "loss": 0.9693, "step": 1586 }, { "epoch": 0.660166913657281, "grad_norm": 1.5238796472549438, "learning_rate": 1.813313990333607e-05, "loss": 0.9729, "step": 1587 }, { "epoch": 0.6605828978498817, "grad_norm": 3133.64697265625, "learning_rate": 1.8130532331277567e-05, "loss": 0.8407, "step": 1588 }, { "epoch": 0.6609988820424824, "grad_norm": 1.4733169078826904, "learning_rate": 1.812792312716364e-05, "loss": 0.8825, "step": 1589 }, { "epoch": 0.661414866235083, "grad_norm": 1.5110372304916382, "learning_rate": 1.812531229151803e-05, "loss": 0.9232, "step": 1590 }, { "epoch": 0.6618308504276837, "grad_norm": 1.3970744609832764, "learning_rate": 1.812269982486482e-05, "loss": 0.9435, "step": 1591 }, { "epoch": 0.6622468346202844, "grad_norm": 1.4956984519958496, "learning_rate": 1.8120085727728413e-05, "loss": 0.8966, "step": 1592 }, { "epoch": 0.6626628188128851, "grad_norm": 1.4897863864898682, "learning_rate": 1.8117470000633542e-05, "loss": 0.9464, "step": 1593 }, { "epoch": 0.6630788030054858, "grad_norm": 1.5995010137557983, "learning_rate": 1.8114852644105266e-05, "loss": 0.9233, "step": 1594 }, { "epoch": 0.6634947871980865, "grad_norm": 1.490752100944519, "learning_rate": 1.811223365866897e-05, "loss": 0.8493, "step": 1595 }, { "epoch": 0.6639107713906871, "grad_norm": 1.459678053855896, "learning_rate": 1.810961304485037e-05, "loss": 0.9795, "step": 1596 }, { "epoch": 0.6643267555832878, "grad_norm": 1.6411012411117554, "learning_rate": 1.8106990803175506e-05, "loss": 0.9961, "step": 1597 }, { "epoch": 0.6647427397758885, "grad_norm": 1.6603918075561523, "learning_rate": 1.8104366934170743e-05, "loss": 0.9596, "step": 1598 }, { "epoch": 0.6651587239684892, "grad_norm": 8.074029922485352, "learning_rate": 1.8101741438362778e-05, "loss": 0.8449, "step": 1599 }, { "epoch": 0.6655747081610899, "grad_norm": 1.546288251876831, "learning_rate": 1.8099114316278622e-05, "loss": 1.0059, "step": 1600 }, { "epoch": 0.6659906923536906, "grad_norm": 1.410264253616333, "learning_rate": 1.809648556844563e-05, "loss": 0.8784, "step": 1601 }, { "epoch": 0.6664066765462913, "grad_norm": 1.4466936588287354, "learning_rate": 1.8093855195391476e-05, "loss": 0.8761, "step": 1602 }, { "epoch": 0.666822660738892, "grad_norm": 1.4022538661956787, "learning_rate": 1.8091223197644157e-05, "loss": 0.8905, "step": 1603 }, { "epoch": 0.6672386449314927, "grad_norm": 1.3400168418884277, "learning_rate": 1.8088589575731996e-05, "loss": 0.8801, "step": 1604 }, { "epoch": 0.6676546291240932, "grad_norm": 1.4133840799331665, "learning_rate": 1.8085954330183645e-05, "loss": 0.8891, "step": 1605 }, { "epoch": 0.6680706133166939, "grad_norm": 1.5441828966140747, "learning_rate": 1.8083317461528083e-05, "loss": 1.0936, "step": 1606 }, { "epoch": 0.6684865975092946, "grad_norm": 1.3965868949890137, "learning_rate": 1.8080678970294613e-05, "loss": 0.9445, "step": 1607 }, { "epoch": 0.6689025817018953, "grad_norm": 1.4509074687957764, "learning_rate": 1.8078038857012865e-05, "loss": 0.8697, "step": 1608 }, { "epoch": 0.669318565894496, "grad_norm": 1.4596914052963257, "learning_rate": 1.807539712221279e-05, "loss": 0.8581, "step": 1609 }, { "epoch": 0.6697345500870967, "grad_norm": 1.4609848260879517, "learning_rate": 1.807275376642467e-05, "loss": 0.953, "step": 1610 }, { "epoch": 0.6701505342796974, "grad_norm": 1.5730276107788086, "learning_rate": 1.8070108790179107e-05, "loss": 0.9213, "step": 1611 }, { "epoch": 0.6705665184722981, "grad_norm": 2.0116114616394043, "learning_rate": 1.8067462194007034e-05, "loss": 0.8503, "step": 1612 }, { "epoch": 0.6709825026648988, "grad_norm": 1.4465826749801636, "learning_rate": 1.806481397843971e-05, "loss": 0.9959, "step": 1613 }, { "epoch": 0.6713984868574994, "grad_norm": 1.554391622543335, "learning_rate": 1.8062164144008713e-05, "loss": 0.9038, "step": 1614 }, { "epoch": 0.6718144710501001, "grad_norm": 6.912044525146484, "learning_rate": 1.8059512691245945e-05, "loss": 0.9748, "step": 1615 }, { "epoch": 0.6722304552427008, "grad_norm": 1.4732089042663574, "learning_rate": 1.8056859620683644e-05, "loss": 0.9217, "step": 1616 }, { "epoch": 0.6726464394353014, "grad_norm": 79.76143646240234, "learning_rate": 1.805420493285436e-05, "loss": 0.814, "step": 1617 }, { "epoch": 0.6730624236279021, "grad_norm": 1.5035533905029297, "learning_rate": 1.8051548628290974e-05, "loss": 0.9655, "step": 1618 }, { "epoch": 0.6734784078205028, "grad_norm": 1.4644068479537964, "learning_rate": 1.804889070752669e-05, "loss": 0.92, "step": 1619 }, { "epoch": 0.6738943920131035, "grad_norm": 1.5275256633758545, "learning_rate": 1.804623117109504e-05, "loss": 0.875, "step": 1620 }, { "epoch": 0.6743103762057042, "grad_norm": 6.078028202056885, "learning_rate": 1.8043570019529872e-05, "loss": 0.945, "step": 1621 }, { "epoch": 0.6747263603983049, "grad_norm": 1.4589824676513672, "learning_rate": 1.8040907253365367e-05, "loss": 0.9993, "step": 1622 }, { "epoch": 0.6751423445909055, "grad_norm": 1.5416120290756226, "learning_rate": 1.803824287313603e-05, "loss": 0.8889, "step": 1623 }, { "epoch": 0.6755583287835062, "grad_norm": 1.3892624378204346, "learning_rate": 1.803557687937668e-05, "loss": 0.9475, "step": 1624 }, { "epoch": 0.6759743129761069, "grad_norm": 1.4611111879348755, "learning_rate": 1.8032909272622463e-05, "loss": 0.8825, "step": 1625 }, { "epoch": 0.6763902971687076, "grad_norm": 1.5887739658355713, "learning_rate": 1.803024005340886e-05, "loss": 0.9202, "step": 1626 }, { "epoch": 0.6768062813613083, "grad_norm": 1.5593563318252563, "learning_rate": 1.8027569222271673e-05, "loss": 0.9378, "step": 1627 }, { "epoch": 0.677222265553909, "grad_norm": 1.5099854469299316, "learning_rate": 1.802489677974701e-05, "loss": 0.9344, "step": 1628 }, { "epoch": 0.6776382497465097, "grad_norm": 1.5854697227478027, "learning_rate": 1.802222272637132e-05, "loss": 1.0099, "step": 1629 }, { "epoch": 0.6780542339391104, "grad_norm": 1.5500190258026123, "learning_rate": 1.8019547062681374e-05, "loss": 0.9435, "step": 1630 }, { "epoch": 0.678470218131711, "grad_norm": 1.551085114479065, "learning_rate": 1.8016869789214257e-05, "loss": 0.8569, "step": 1631 }, { "epoch": 0.6788862023243116, "grad_norm": 1.483437418937683, "learning_rate": 1.801419090650738e-05, "loss": 0.8701, "step": 1632 }, { "epoch": 0.6793021865169123, "grad_norm": 1.3734341859817505, "learning_rate": 1.801151041509849e-05, "loss": 0.8571, "step": 1633 }, { "epoch": 0.679718170709513, "grad_norm": 1.5610358715057373, "learning_rate": 1.8008828315525637e-05, "loss": 0.8874, "step": 1634 }, { "epoch": 0.6801341549021137, "grad_norm": 1.5940457582473755, "learning_rate": 1.8006144608327208e-05, "loss": 0.9346, "step": 1635 }, { "epoch": 0.6805501390947144, "grad_norm": 1.3490828275680542, "learning_rate": 1.8003459294041907e-05, "loss": 0.9355, "step": 1636 }, { "epoch": 0.6809661232873151, "grad_norm": 1.3931859731674194, "learning_rate": 1.8000772373208763e-05, "loss": 0.8832, "step": 1637 }, { "epoch": 0.6813821074799158, "grad_norm": 1.5041691064834595, "learning_rate": 1.7998083846367125e-05, "loss": 0.9236, "step": 1638 }, { "epoch": 0.6817980916725165, "grad_norm": 1.5007191896438599, "learning_rate": 1.799539371405666e-05, "loss": 1.0181, "step": 1639 }, { "epoch": 0.6822140758651172, "grad_norm": 1.5273197889328003, "learning_rate": 1.7992701976817376e-05, "loss": 0.8726, "step": 1640 }, { "epoch": 0.6826300600577178, "grad_norm": 1.347548246383667, "learning_rate": 1.7990008635189578e-05, "loss": 0.859, "step": 1641 }, { "epoch": 0.6830460442503185, "grad_norm": 1.5179802179336548, "learning_rate": 1.798731368971391e-05, "loss": 0.9949, "step": 1642 }, { "epoch": 0.6834620284429191, "grad_norm": 1.5117051601409912, "learning_rate": 1.798461714093133e-05, "loss": 0.9022, "step": 1643 }, { "epoch": 0.6838780126355198, "grad_norm": 1.4363315105438232, "learning_rate": 1.7981918989383123e-05, "loss": 0.9, "step": 1644 }, { "epoch": 0.6842939968281205, "grad_norm": 1.5309438705444336, "learning_rate": 1.7979219235610896e-05, "loss": 0.9112, "step": 1645 }, { "epoch": 0.6847099810207212, "grad_norm": 1.373647689819336, "learning_rate": 1.797651788015657e-05, "loss": 0.9391, "step": 1646 }, { "epoch": 0.6851259652133219, "grad_norm": 1.3757283687591553, "learning_rate": 1.7973814923562397e-05, "loss": 0.9259, "step": 1647 }, { "epoch": 0.6855419494059226, "grad_norm": 1.3516490459442139, "learning_rate": 1.7971110366370944e-05, "loss": 0.7178, "step": 1648 }, { "epoch": 0.6859579335985233, "grad_norm": 1.4557592868804932, "learning_rate": 1.79684042091251e-05, "loss": 1.0312, "step": 1649 }, { "epoch": 0.6863739177911239, "grad_norm": 1.4894040822982788, "learning_rate": 1.796569645236808e-05, "loss": 0.9988, "step": 1650 }, { "epoch": 0.6867899019837246, "grad_norm": 1.500855803489685, "learning_rate": 1.7962987096643412e-05, "loss": 0.785, "step": 1651 }, { "epoch": 0.6872058861763253, "grad_norm": 1.5093514919281006, "learning_rate": 1.7960276142494953e-05, "loss": 0.9482, "step": 1652 }, { "epoch": 0.687621870368926, "grad_norm": 1.4174067974090576, "learning_rate": 1.7957563590466875e-05, "loss": 0.8434, "step": 1653 }, { "epoch": 0.6880378545615267, "grad_norm": 1.442991018295288, "learning_rate": 1.7954849441103673e-05, "loss": 0.9, "step": 1654 }, { "epoch": 0.6884538387541274, "grad_norm": 1.400312900543213, "learning_rate": 1.7952133694950168e-05, "loss": 0.8863, "step": 1655 }, { "epoch": 0.688869822946728, "grad_norm": 1.4110233783721924, "learning_rate": 1.7949416352551493e-05, "loss": 0.794, "step": 1656 }, { "epoch": 0.6892858071393287, "grad_norm": 1.5460880994796753, "learning_rate": 1.79466974144531e-05, "loss": 0.9402, "step": 1657 }, { "epoch": 0.6897017913319294, "grad_norm": 1.4376300573349, "learning_rate": 1.7943976881200776e-05, "loss": 0.9404, "step": 1658 }, { "epoch": 0.69011777552453, "grad_norm": 1.5279051065444946, "learning_rate": 1.7941254753340608e-05, "loss": 0.844, "step": 1659 }, { "epoch": 0.6905337597171307, "grad_norm": 26.669902801513672, "learning_rate": 1.7938531031419016e-05, "loss": 0.8491, "step": 1660 }, { "epoch": 0.6909497439097314, "grad_norm": 1.5423352718353271, "learning_rate": 1.7935805715982746e-05, "loss": 1.0561, "step": 1661 }, { "epoch": 0.6913657281023321, "grad_norm": 1.6376360654830933, "learning_rate": 1.7933078807578843e-05, "loss": 0.9601, "step": 1662 }, { "epoch": 0.6917817122949328, "grad_norm": 1.4892045259475708, "learning_rate": 1.793035030675469e-05, "loss": 1.0483, "step": 1663 }, { "epoch": 0.6921976964875335, "grad_norm": 1.513698697090149, "learning_rate": 1.7927620214057984e-05, "loss": 0.9567, "step": 1664 }, { "epoch": 0.6926136806801342, "grad_norm": 1.532441258430481, "learning_rate": 1.792488853003674e-05, "loss": 0.9206, "step": 1665 }, { "epoch": 0.6930296648727349, "grad_norm": 1.35043466091156, "learning_rate": 1.792215525523929e-05, "loss": 0.9049, "step": 1666 }, { "epoch": 0.6934456490653356, "grad_norm": 1.43630051612854, "learning_rate": 1.7919420390214294e-05, "loss": 0.9547, "step": 1667 }, { "epoch": 0.6938616332579362, "grad_norm": 1.3142260313034058, "learning_rate": 1.7916683935510727e-05, "loss": 0.7885, "step": 1668 }, { "epoch": 0.6942776174505368, "grad_norm": 1.4360215663909912, "learning_rate": 1.7913945891677876e-05, "loss": 0.9204, "step": 1669 }, { "epoch": 0.6946936016431375, "grad_norm": 1.4242238998413086, "learning_rate": 1.791120625926536e-05, "loss": 0.9519, "step": 1670 }, { "epoch": 0.6951095858357382, "grad_norm": 1.4459933042526245, "learning_rate": 1.79084650388231e-05, "loss": 0.8635, "step": 1671 }, { "epoch": 0.6955255700283389, "grad_norm": 1.4400769472122192, "learning_rate": 1.7905722230901358e-05, "loss": 0.8886, "step": 1672 }, { "epoch": 0.6959415542209396, "grad_norm": 1.555098533630371, "learning_rate": 1.7902977836050696e-05, "loss": 1.0834, "step": 1673 }, { "epoch": 0.6963575384135403, "grad_norm": 1.572966456413269, "learning_rate": 1.7900231854822e-05, "loss": 0.9934, "step": 1674 }, { "epoch": 0.696773522606141, "grad_norm": 1.5428731441497803, "learning_rate": 1.7897484287766476e-05, "loss": 0.8078, "step": 1675 }, { "epoch": 0.6971895067987417, "grad_norm": 1.5048155784606934, "learning_rate": 1.789473513543565e-05, "loss": 0.8822, "step": 1676 }, { "epoch": 0.6976054909913423, "grad_norm": 1.472704529762268, "learning_rate": 1.7891984398381364e-05, "loss": 0.7869, "step": 1677 }, { "epoch": 0.698021475183943, "grad_norm": 1.5892473459243774, "learning_rate": 1.788923207715578e-05, "loss": 0.9008, "step": 1678 }, { "epoch": 0.6984374593765437, "grad_norm": 1.3708387613296509, "learning_rate": 1.788647817231137e-05, "loss": 0.919, "step": 1679 }, { "epoch": 0.6988534435691444, "grad_norm": 1.4796563386917114, "learning_rate": 1.7883722684400934e-05, "loss": 0.8973, "step": 1680 }, { "epoch": 0.699269427761745, "grad_norm": 1.6067992448806763, "learning_rate": 1.788096561397758e-05, "loss": 0.9987, "step": 1681 }, { "epoch": 0.6996854119543457, "grad_norm": 1.4426524639129639, "learning_rate": 1.787820696159475e-05, "loss": 0.9347, "step": 1682 }, { "epoch": 0.7001013961469464, "grad_norm": 1.514463186264038, "learning_rate": 1.7875446727806186e-05, "loss": 0.9296, "step": 1683 }, { "epoch": 0.7005173803395471, "grad_norm": 243.21641540527344, "learning_rate": 1.787268491316595e-05, "loss": 0.9393, "step": 1684 }, { "epoch": 0.7009333645321478, "grad_norm": 1.4858286380767822, "learning_rate": 1.786992151822844e-05, "loss": 0.8218, "step": 1685 }, { "epoch": 0.7013493487247484, "grad_norm": 1.4520137310028076, "learning_rate": 1.786715654354834e-05, "loss": 0.8954, "step": 1686 }, { "epoch": 0.7017653329173491, "grad_norm": 1.5622197389602661, "learning_rate": 1.786438998968068e-05, "loss": 1.0412, "step": 1687 }, { "epoch": 0.7021813171099498, "grad_norm": 30.238004684448242, "learning_rate": 1.786162185718079e-05, "loss": 0.9605, "step": 1688 }, { "epoch": 0.7025973013025505, "grad_norm": 1.5212817192077637, "learning_rate": 1.7858852146604325e-05, "loss": 0.8828, "step": 1689 }, { "epoch": 0.7030132854951512, "grad_norm": 1.4976688623428345, "learning_rate": 1.785608085850725e-05, "loss": 0.9088, "step": 1690 }, { "epoch": 0.7034292696877519, "grad_norm": 1.4582853317260742, "learning_rate": 1.785330799344585e-05, "loss": 0.9596, "step": 1691 }, { "epoch": 0.7038452538803526, "grad_norm": 1.5751292705535889, "learning_rate": 1.7850533551976733e-05, "loss": 0.9679, "step": 1692 }, { "epoch": 0.7042612380729533, "grad_norm": 1.6586453914642334, "learning_rate": 1.784775753465681e-05, "loss": 0.9637, "step": 1693 }, { "epoch": 0.704677222265554, "grad_norm": 1.4318041801452637, "learning_rate": 1.784497994204332e-05, "loss": 0.9102, "step": 1694 }, { "epoch": 0.7050932064581545, "grad_norm": 1.5090550184249878, "learning_rate": 1.7842200774693813e-05, "loss": 0.8894, "step": 1695 }, { "epoch": 0.7055091906507552, "grad_norm": 1.4875361919403076, "learning_rate": 1.7839420033166153e-05, "loss": 0.8183, "step": 1696 }, { "epoch": 0.7059251748433559, "grad_norm": 1.5408873558044434, "learning_rate": 1.783663771801853e-05, "loss": 0.809, "step": 1697 }, { "epoch": 0.7063411590359566, "grad_norm": 1.7006555795669556, "learning_rate": 1.7833853829809434e-05, "loss": 0.9344, "step": 1698 }, { "epoch": 0.7067571432285573, "grad_norm": 1.4673486948013306, "learning_rate": 1.7831068369097685e-05, "loss": 0.8435, "step": 1699 }, { "epoch": 0.707173127421158, "grad_norm": 1.4889215230941772, "learning_rate": 1.782828133644241e-05, "loss": 0.8493, "step": 1700 }, { "epoch": 0.7075891116137587, "grad_norm": 1.447808861732483, "learning_rate": 1.782549273240306e-05, "loss": 0.8096, "step": 1701 }, { "epoch": 0.7080050958063594, "grad_norm": 1.5296701192855835, "learning_rate": 1.782270255753939e-05, "loss": 0.9456, "step": 1702 }, { "epoch": 0.7084210799989601, "grad_norm": 1.6064810752868652, "learning_rate": 1.7819910812411484e-05, "loss": 0.9682, "step": 1703 }, { "epoch": 0.7088370641915607, "grad_norm": 1.431450366973877, "learning_rate": 1.7817117497579726e-05, "loss": 0.871, "step": 1704 }, { "epoch": 0.7092530483841614, "grad_norm": 4.048548698425293, "learning_rate": 1.7814322613604826e-05, "loss": 0.789, "step": 1705 }, { "epoch": 0.7096690325767621, "grad_norm": 1.4525556564331055, "learning_rate": 1.7811526161047806e-05, "loss": 0.9609, "step": 1706 }, { "epoch": 0.7100850167693628, "grad_norm": 1.3532090187072754, "learning_rate": 1.780872814047e-05, "loss": 0.9176, "step": 1707 }, { "epoch": 0.7105010009619634, "grad_norm": 10.308806419372559, "learning_rate": 1.7805928552433064e-05, "loss": 1.0589, "step": 1708 }, { "epoch": 0.7109169851545641, "grad_norm": 1.3866636753082275, "learning_rate": 1.7803127397498965e-05, "loss": 0.7543, "step": 1709 }, { "epoch": 0.7113329693471648, "grad_norm": 1.481532096862793, "learning_rate": 1.7800324676229975e-05, "loss": 0.8841, "step": 1710 }, { "epoch": 0.7117489535397655, "grad_norm": 1.2921556234359741, "learning_rate": 1.7797520389188697e-05, "loss": 0.7527, "step": 1711 }, { "epoch": 0.7121649377323662, "grad_norm": 1.8216936588287354, "learning_rate": 1.7794714536938037e-05, "loss": 0.9471, "step": 1712 }, { "epoch": 0.7125809219249668, "grad_norm": 1.431530237197876, "learning_rate": 1.7791907120041215e-05, "loss": 0.7852, "step": 1713 }, { "epoch": 0.7129969061175675, "grad_norm": 1.5060746669769287, "learning_rate": 1.7789098139061776e-05, "loss": 0.9288, "step": 1714 }, { "epoch": 0.7134128903101682, "grad_norm": 1.5067880153656006, "learning_rate": 1.778628759456357e-05, "loss": 0.9018, "step": 1715 }, { "epoch": 0.7138288745027689, "grad_norm": 1.497389554977417, "learning_rate": 1.7783475487110754e-05, "loss": 0.8422, "step": 1716 }, { "epoch": 0.7142448586953696, "grad_norm": 1.4436801671981812, "learning_rate": 1.778066181726782e-05, "loss": 0.944, "step": 1717 }, { "epoch": 0.7146608428879703, "grad_norm": 1.4577909708023071, "learning_rate": 1.777784658559955e-05, "loss": 0.9194, "step": 1718 }, { "epoch": 0.715076827080571, "grad_norm": 1.6651705503463745, "learning_rate": 1.7775029792671055e-05, "loss": 1.077, "step": 1719 }, { "epoch": 0.7154928112731717, "grad_norm": 1.6995283365249634, "learning_rate": 1.7772211439047754e-05, "loss": 0.9596, "step": 1720 }, { "epoch": 0.7159087954657724, "grad_norm": 1.5481748580932617, "learning_rate": 1.776939152529538e-05, "loss": 0.8792, "step": 1721 }, { "epoch": 0.7163247796583729, "grad_norm": 1.4868000745773315, "learning_rate": 1.7766570051979974e-05, "loss": 0.914, "step": 1722 }, { "epoch": 0.7167407638509736, "grad_norm": 1.3931506872177124, "learning_rate": 1.7763747019667904e-05, "loss": 0.8333, "step": 1723 }, { "epoch": 0.7171567480435743, "grad_norm": 4.3085551261901855, "learning_rate": 1.7760922428925838e-05, "loss": 0.8433, "step": 1724 }, { "epoch": 0.717572732236175, "grad_norm": 296.0108642578125, "learning_rate": 1.775809628032076e-05, "loss": 0.9585, "step": 1725 }, { "epoch": 0.7179887164287757, "grad_norm": 1.5024915933609009, "learning_rate": 1.775526857441997e-05, "loss": 0.8846, "step": 1726 }, { "epoch": 0.7184047006213764, "grad_norm": 1.5414049625396729, "learning_rate": 1.7752439311791072e-05, "loss": 0.8901, "step": 1727 }, { "epoch": 0.7188206848139771, "grad_norm": 1.4161615371704102, "learning_rate": 1.7749608493001994e-05, "loss": 0.9049, "step": 1728 }, { "epoch": 0.7192366690065778, "grad_norm": 1.5662083625793457, "learning_rate": 1.7746776118620973e-05, "loss": 0.8892, "step": 1729 }, { "epoch": 0.7196526531991785, "grad_norm": 1.5256218910217285, "learning_rate": 1.774394218921655e-05, "loss": 0.8687, "step": 1730 }, { "epoch": 0.7200686373917791, "grad_norm": 1.4419583082199097, "learning_rate": 1.7741106705357588e-05, "loss": 0.8361, "step": 1731 }, { "epoch": 0.7204846215843798, "grad_norm": 1.5782448053359985, "learning_rate": 1.773826966761326e-05, "loss": 0.9253, "step": 1732 }, { "epoch": 0.7209006057769805, "grad_norm": 1.345860242843628, "learning_rate": 1.7735431076553043e-05, "loss": 0.864, "step": 1733 }, { "epoch": 0.7213165899695811, "grad_norm": 1.5824897289276123, "learning_rate": 1.773259093274674e-05, "loss": 1.0577, "step": 1734 }, { "epoch": 0.7217325741621818, "grad_norm": 1.360946774482727, "learning_rate": 1.7729749236764457e-05, "loss": 0.8188, "step": 1735 }, { "epoch": 0.7221485583547825, "grad_norm": 1.48318350315094, "learning_rate": 1.7726905989176604e-05, "loss": 0.8204, "step": 1736 }, { "epoch": 0.7225645425473832, "grad_norm": 1.5465515851974487, "learning_rate": 1.772406119055392e-05, "loss": 1.02, "step": 1737 }, { "epoch": 0.7229805267399839, "grad_norm": 1.3639615774154663, "learning_rate": 1.7721214841467443e-05, "loss": 0.7656, "step": 1738 }, { "epoch": 0.7233965109325846, "grad_norm": 1.4149566888809204, "learning_rate": 1.7718366942488526e-05, "loss": 0.9758, "step": 1739 }, { "epoch": 0.7238124951251852, "grad_norm": 1.435463786125183, "learning_rate": 1.771551749418883e-05, "loss": 0.9576, "step": 1740 }, { "epoch": 0.7242284793177859, "grad_norm": 1.4481515884399414, "learning_rate": 1.771266649714033e-05, "loss": 0.9587, "step": 1741 }, { "epoch": 0.7246444635103866, "grad_norm": 1.4625165462493896, "learning_rate": 1.7709813951915318e-05, "loss": 0.8976, "step": 1742 }, { "epoch": 0.7250604477029873, "grad_norm": 1.474661946296692, "learning_rate": 1.770695985908638e-05, "loss": 0.9143, "step": 1743 }, { "epoch": 0.725476431895588, "grad_norm": 1.4984697103500366, "learning_rate": 1.770410421922643e-05, "loss": 0.8628, "step": 1744 }, { "epoch": 0.7258924160881887, "grad_norm": 1.5861927270889282, "learning_rate": 1.7701247032908685e-05, "loss": 0.9746, "step": 1745 }, { "epoch": 0.7263084002807894, "grad_norm": 9.739063262939453, "learning_rate": 1.769838830070667e-05, "loss": 0.7772, "step": 1746 }, { "epoch": 0.72672438447339, "grad_norm": 1.6132766008377075, "learning_rate": 1.7695528023194222e-05, "loss": 0.9079, "step": 1747 }, { "epoch": 0.7271403686659907, "grad_norm": 1.4865694046020508, "learning_rate": 1.7692666200945493e-05, "loss": 0.8733, "step": 1748 }, { "epoch": 0.7275563528585913, "grad_norm": 1.6608200073242188, "learning_rate": 1.7689802834534942e-05, "loss": 0.9128, "step": 1749 }, { "epoch": 0.727972337051192, "grad_norm": 1.6812273263931274, "learning_rate": 1.7686937924537332e-05, "loss": 0.9118, "step": 1750 }, { "epoch": 0.7283883212437927, "grad_norm": 49212.84375, "learning_rate": 1.7684071471527747e-05, "loss": 0.9265, "step": 1751 }, { "epoch": 0.7288043054363934, "grad_norm": 1.671183705329895, "learning_rate": 1.768120347608157e-05, "loss": 0.9238, "step": 1752 }, { "epoch": 0.7292202896289941, "grad_norm": 1.4157012701034546, "learning_rate": 1.7678333938774506e-05, "loss": 0.77, "step": 1753 }, { "epoch": 0.7296362738215948, "grad_norm": 1.6364142894744873, "learning_rate": 1.7675462860182554e-05, "loss": 0.9648, "step": 1754 }, { "epoch": 0.7300522580141955, "grad_norm": 1.5400967597961426, "learning_rate": 1.767259024088203e-05, "loss": 0.7678, "step": 1755 }, { "epoch": 0.7304682422067962, "grad_norm": 2.1786751747131348, "learning_rate": 1.7669716081449572e-05, "loss": 0.9327, "step": 1756 }, { "epoch": 0.7308842263993969, "grad_norm": 1.517060399055481, "learning_rate": 1.76668403824621e-05, "loss": 0.9107, "step": 1757 }, { "epoch": 0.7313002105919975, "grad_norm": 1.3993613719940186, "learning_rate": 1.7663963144496863e-05, "loss": 0.8292, "step": 1758 }, { "epoch": 0.7317161947845982, "grad_norm": 1.3886295557022095, "learning_rate": 1.7661084368131417e-05, "loss": 0.8977, "step": 1759 }, { "epoch": 0.7321321789771988, "grad_norm": 1.4752553701400757, "learning_rate": 1.765820405394362e-05, "loss": 0.9124, "step": 1760 }, { "epoch": 0.7325481631697995, "grad_norm": 8.513226509094238, "learning_rate": 1.7655322202511646e-05, "loss": 0.9381, "step": 1761 }, { "epoch": 0.7329641473624002, "grad_norm": 1.5348018407821655, "learning_rate": 1.7652438814413967e-05, "loss": 1.1002, "step": 1762 }, { "epoch": 0.7333801315550009, "grad_norm": 1.5456593036651611, "learning_rate": 1.7649553890229377e-05, "loss": 0.8889, "step": 1763 }, { "epoch": 0.7337961157476016, "grad_norm": 1.4213535785675049, "learning_rate": 1.7646667430536972e-05, "loss": 1.0372, "step": 1764 }, { "epoch": 0.7342120999402023, "grad_norm": 262.8396911621094, "learning_rate": 1.7643779435916153e-05, "loss": 0.9371, "step": 1765 }, { "epoch": 0.734628084132803, "grad_norm": 2.686190605163574, "learning_rate": 1.7640889906946633e-05, "loss": 0.9027, "step": 1766 }, { "epoch": 0.7350440683254036, "grad_norm": 1.470332384109497, "learning_rate": 1.763799884420843e-05, "loss": 0.8158, "step": 1767 }, { "epoch": 0.7354600525180043, "grad_norm": 1.3826487064361572, "learning_rate": 1.7635106248281874e-05, "loss": 0.8333, "step": 1768 }, { "epoch": 0.735876036710605, "grad_norm": 1.426018238067627, "learning_rate": 1.76322121197476e-05, "loss": 0.7847, "step": 1769 }, { "epoch": 0.7362920209032057, "grad_norm": 1.4305546283721924, "learning_rate": 1.7629316459186547e-05, "loss": 0.9048, "step": 1770 }, { "epoch": 0.7367080050958064, "grad_norm": 1.5428189039230347, "learning_rate": 1.7626419267179975e-05, "loss": 0.9062, "step": 1771 }, { "epoch": 0.737123989288407, "grad_norm": 1.428858757019043, "learning_rate": 1.762352054430944e-05, "loss": 0.8455, "step": 1772 }, { "epoch": 0.7375399734810077, "grad_norm": 1.5367330312728882, "learning_rate": 1.76206202911568e-05, "loss": 1.0288, "step": 1773 }, { "epoch": 0.7379559576736084, "grad_norm": 1.5545177459716797, "learning_rate": 1.7617718508304233e-05, "loss": 0.974, "step": 1774 }, { "epoch": 0.7383719418662091, "grad_norm": 1.533164381980896, "learning_rate": 1.7614815196334218e-05, "loss": 0.9844, "step": 1775 }, { "epoch": 0.7387879260588097, "grad_norm": 1.379095435142517, "learning_rate": 1.7611910355829546e-05, "loss": 0.9533, "step": 1776 }, { "epoch": 0.7392039102514104, "grad_norm": 1.5948717594146729, "learning_rate": 1.7609003987373305e-05, "loss": 0.8966, "step": 1777 }, { "epoch": 0.7396198944440111, "grad_norm": 1.6331623792648315, "learning_rate": 1.7606096091548893e-05, "loss": 0.9254, "step": 1778 }, { "epoch": 0.7400358786366118, "grad_norm": 1.3614236116409302, "learning_rate": 1.7603186668940024e-05, "loss": 0.7867, "step": 1779 }, { "epoch": 0.7404518628292125, "grad_norm": 1.4389071464538574, "learning_rate": 1.7600275720130708e-05, "loss": 0.8996, "step": 1780 }, { "epoch": 0.7408678470218132, "grad_norm": 1.5783212184906006, "learning_rate": 1.7597363245705265e-05, "loss": 0.9075, "step": 1781 }, { "epoch": 0.7412838312144139, "grad_norm": 1.51686692237854, "learning_rate": 1.7594449246248317e-05, "loss": 0.8983, "step": 1782 }, { "epoch": 0.7416998154070146, "grad_norm": 1.608201503753662, "learning_rate": 1.75915337223448e-05, "loss": 0.9242, "step": 1783 }, { "epoch": 0.7421157995996153, "grad_norm": 1.5803617238998413, "learning_rate": 1.7588616674579956e-05, "loss": 0.9378, "step": 1784 }, { "epoch": 0.7425317837922158, "grad_norm": 1.5570224523544312, "learning_rate": 1.758569810353932e-05, "loss": 0.9787, "step": 1785 }, { "epoch": 0.7429477679848165, "grad_norm": 1.596070647239685, "learning_rate": 1.7582778009808754e-05, "loss": 0.9986, "step": 1786 }, { "epoch": 0.7433637521774172, "grad_norm": 1.5142192840576172, "learning_rate": 1.7579856393974398e-05, "loss": 0.8832, "step": 1787 }, { "epoch": 0.7437797363700179, "grad_norm": 2.0634140968322754, "learning_rate": 1.757693325662272e-05, "loss": 0.8851, "step": 1788 }, { "epoch": 0.7441957205626186, "grad_norm": 1.4633303880691528, "learning_rate": 1.757400859834049e-05, "loss": 0.8006, "step": 1789 }, { "epoch": 0.7446117047552193, "grad_norm": 1.553451418876648, "learning_rate": 1.7571082419714772e-05, "loss": 0.8937, "step": 1790 }, { "epoch": 0.74502768894782, "grad_norm": 1.4932821989059448, "learning_rate": 1.756815472133295e-05, "loss": 0.9366, "step": 1791 }, { "epoch": 0.7454436731404207, "grad_norm": 1.5166339874267578, "learning_rate": 1.7565225503782702e-05, "loss": 0.8456, "step": 1792 }, { "epoch": 0.7458596573330214, "grad_norm": 1.504471778869629, "learning_rate": 1.7562294767652014e-05, "loss": 0.8497, "step": 1793 }, { "epoch": 0.746275641525622, "grad_norm": 11.015772819519043, "learning_rate": 1.755936251352918e-05, "loss": 0.8224, "step": 1794 }, { "epoch": 0.7466916257182227, "grad_norm": 1.552992582321167, "learning_rate": 1.7556428742002797e-05, "loss": 0.9585, "step": 1795 }, { "epoch": 0.7471076099108234, "grad_norm": 1.6966314315795898, "learning_rate": 1.7553493453661764e-05, "loss": 0.9901, "step": 1796 }, { "epoch": 0.7475235941034241, "grad_norm": 1.491487979888916, "learning_rate": 1.7550556649095283e-05, "loss": 0.8285, "step": 1797 }, { "epoch": 0.7479395782960248, "grad_norm": 1.4702329635620117, "learning_rate": 1.7547618328892872e-05, "loss": 0.9969, "step": 1798 }, { "epoch": 0.7483555624886254, "grad_norm": 1.5032457113265991, "learning_rate": 1.7544678493644335e-05, "loss": 0.9723, "step": 1799 }, { "epoch": 0.7487715466812261, "grad_norm": 1.5405749082565308, "learning_rate": 1.7541737143939798e-05, "loss": 0.9447, "step": 1800 }, { "epoch": 0.7491875308738268, "grad_norm": 1.3702938556671143, "learning_rate": 1.7538794280369683e-05, "loss": 0.8093, "step": 1801 }, { "epoch": 0.7496035150664275, "grad_norm": 638.6826171875, "learning_rate": 1.753584990352471e-05, "loss": 0.8555, "step": 1802 }, { "epoch": 0.7500194992590281, "grad_norm": 1.5753099918365479, "learning_rate": 1.7532904013995912e-05, "loss": 0.9804, "step": 1803 }, { "epoch": 0.7504354834516288, "grad_norm": 1.424763798713684, "learning_rate": 1.7529956612374625e-05, "loss": 0.9029, "step": 1804 }, { "epoch": 0.7508514676442295, "grad_norm": 1.4494051933288574, "learning_rate": 1.7527007699252484e-05, "loss": 0.9063, "step": 1805 }, { "epoch": 0.7512674518368302, "grad_norm": 1.4325536489486694, "learning_rate": 1.7524057275221428e-05, "loss": 0.8961, "step": 1806 }, { "epoch": 0.7516834360294309, "grad_norm": 1.4856685400009155, "learning_rate": 1.7521105340873696e-05, "loss": 0.899, "step": 1807 }, { "epoch": 0.7520994202220316, "grad_norm": 1.6380524635314941, "learning_rate": 1.7518151896801846e-05, "loss": 0.9372, "step": 1808 }, { "epoch": 0.7525154044146323, "grad_norm": 1.5714690685272217, "learning_rate": 1.751519694359872e-05, "loss": 0.958, "step": 1809 }, { "epoch": 0.752931388607233, "grad_norm": 1.4809311628341675, "learning_rate": 1.751224048185747e-05, "loss": 0.8395, "step": 1810 }, { "epoch": 0.7533473727998337, "grad_norm": 1.501021146774292, "learning_rate": 1.7509282512171556e-05, "loss": 0.8561, "step": 1811 }, { "epoch": 0.7537633569924342, "grad_norm": 1.5322998762130737, "learning_rate": 1.7506323035134735e-05, "loss": 0.7975, "step": 1812 }, { "epoch": 0.7541793411850349, "grad_norm": 1.6150383949279785, "learning_rate": 1.7503362051341068e-05, "loss": 0.8953, "step": 1813 }, { "epoch": 0.7545953253776356, "grad_norm": 1.4630345106124878, "learning_rate": 1.7500399561384913e-05, "loss": 0.8685, "step": 1814 }, { "epoch": 0.7550113095702363, "grad_norm": 1.4279214143753052, "learning_rate": 1.7497435565860942e-05, "loss": 0.9615, "step": 1815 }, { "epoch": 0.755427293762837, "grad_norm": 1.4683743715286255, "learning_rate": 1.7494470065364124e-05, "loss": 0.8498, "step": 1816 }, { "epoch": 0.7558432779554377, "grad_norm": 1.4232127666473389, "learning_rate": 1.749150306048972e-05, "loss": 0.8956, "step": 1817 }, { "epoch": 0.7562592621480384, "grad_norm": 1.5271506309509277, "learning_rate": 1.7488534551833312e-05, "loss": 0.9738, "step": 1818 }, { "epoch": 0.7566752463406391, "grad_norm": 1.509250521659851, "learning_rate": 1.7485564539990767e-05, "loss": 0.8658, "step": 1819 }, { "epoch": 0.7570912305332398, "grad_norm": 1.3956935405731201, "learning_rate": 1.7482593025558262e-05, "loss": 0.8884, "step": 1820 }, { "epoch": 0.7575072147258404, "grad_norm": 1.4551564455032349, "learning_rate": 1.7479620009132278e-05, "loss": 0.825, "step": 1821 }, { "epoch": 0.7579231989184411, "grad_norm": 1.5511674880981445, "learning_rate": 1.7476645491309587e-05, "loss": 0.9092, "step": 1822 }, { "epoch": 0.7583391831110418, "grad_norm": 1.3388762474060059, "learning_rate": 1.7473669472687275e-05, "loss": 0.8256, "step": 1823 }, { "epoch": 0.7587551673036425, "grad_norm": 1.553422451019287, "learning_rate": 1.747069195386272e-05, "loss": 0.9953, "step": 1824 }, { "epoch": 0.7591711514962431, "grad_norm": 1.5095100402832031, "learning_rate": 1.7467712935433603e-05, "loss": 1.0061, "step": 1825 }, { "epoch": 0.7595871356888438, "grad_norm": 1.5014005899429321, "learning_rate": 1.7464732417997908e-05, "loss": 0.8344, "step": 1826 }, { "epoch": 0.7600031198814445, "grad_norm": 1.4992750883102417, "learning_rate": 1.7461750402153926e-05, "loss": 0.8188, "step": 1827 }, { "epoch": 0.7604191040740452, "grad_norm": 1.4118176698684692, "learning_rate": 1.7458766888500235e-05, "loss": 1.0139, "step": 1828 }, { "epoch": 0.7608350882666459, "grad_norm": 1.4029289484024048, "learning_rate": 1.7455781877635717e-05, "loss": 0.7901, "step": 1829 }, { "epoch": 0.7612510724592465, "grad_norm": 1.5738275051116943, "learning_rate": 1.7452795370159567e-05, "loss": 0.9331, "step": 1830 }, { "epoch": 0.7616670566518472, "grad_norm": 21.001829147338867, "learning_rate": 1.744980736667127e-05, "loss": 0.9155, "step": 1831 }, { "epoch": 0.7620830408444479, "grad_norm": 1.4997771978378296, "learning_rate": 1.7446817867770608e-05, "loss": 0.8616, "step": 1832 }, { "epoch": 0.7624990250370486, "grad_norm": 1.4280651807785034, "learning_rate": 1.7443826874057674e-05, "loss": 0.9451, "step": 1833 }, { "epoch": 0.7629150092296493, "grad_norm": 12.48804759979248, "learning_rate": 1.744083438613285e-05, "loss": 0.8734, "step": 1834 }, { "epoch": 0.76333099342225, "grad_norm": 1.52748441696167, "learning_rate": 1.743784040459683e-05, "loss": 0.8405, "step": 1835 }, { "epoch": 0.7637469776148507, "grad_norm": 1.5027996301651, "learning_rate": 1.7434844930050597e-05, "loss": 0.8304, "step": 1836 }, { "epoch": 0.7641629618074514, "grad_norm": 1.620823860168457, "learning_rate": 1.743184796309543e-05, "loss": 1.0042, "step": 1837 }, { "epoch": 0.764578946000052, "grad_norm": 1.5189034938812256, "learning_rate": 1.742884950433293e-05, "loss": 0.9715, "step": 1838 }, { "epoch": 0.7649949301926526, "grad_norm": 1.5877701044082642, "learning_rate": 1.742584955436497e-05, "loss": 0.9168, "step": 1839 }, { "epoch": 0.7654109143852533, "grad_norm": 1.590480923652649, "learning_rate": 1.7422848113793743e-05, "loss": 0.9883, "step": 1840 }, { "epoch": 0.765826898577854, "grad_norm": 1.4616950750350952, "learning_rate": 1.741984518322173e-05, "loss": 0.87, "step": 1841 }, { "epoch": 0.7662428827704547, "grad_norm": 1.400339126586914, "learning_rate": 1.7416840763251714e-05, "loss": 0.8983, "step": 1842 }, { "epoch": 0.7666588669630554, "grad_norm": 36.03741455078125, "learning_rate": 1.741383485448678e-05, "loss": 0.8764, "step": 1843 }, { "epoch": 0.7670748511556561, "grad_norm": 1.499769687652588, "learning_rate": 1.7410827457530305e-05, "loss": 0.9523, "step": 1844 }, { "epoch": 0.7674908353482568, "grad_norm": 1.6136467456817627, "learning_rate": 1.7407818572985973e-05, "loss": 0.8846, "step": 1845 }, { "epoch": 0.7679068195408575, "grad_norm": 1.573909044265747, "learning_rate": 1.7404808201457755e-05, "loss": 0.8263, "step": 1846 }, { "epoch": 0.7683228037334582, "grad_norm": 1.4307018518447876, "learning_rate": 1.740179634354994e-05, "loss": 0.8001, "step": 1847 }, { "epoch": 0.7687387879260588, "grad_norm": 1.5239254236221313, "learning_rate": 1.7398782999867094e-05, "loss": 0.899, "step": 1848 }, { "epoch": 0.7691547721186595, "grad_norm": 1.4850575923919678, "learning_rate": 1.739576817101409e-05, "loss": 0.9942, "step": 1849 }, { "epoch": 0.7695707563112602, "grad_norm": 1.4280385971069336, "learning_rate": 1.739275185759611e-05, "loss": 0.8122, "step": 1850 }, { "epoch": 0.7699867405038608, "grad_norm": 1.4452266693115234, "learning_rate": 1.738973406021861e-05, "loss": 0.8252, "step": 1851 }, { "epoch": 0.7704027246964615, "grad_norm": 1.6754649877548218, "learning_rate": 1.7386714779487364e-05, "loss": 0.8995, "step": 1852 }, { "epoch": 0.7708187088890622, "grad_norm": 1.539137601852417, "learning_rate": 1.7383694016008443e-05, "loss": 0.9555, "step": 1853 }, { "epoch": 0.7712346930816629, "grad_norm": 1.6852155923843384, "learning_rate": 1.73806717703882e-05, "loss": 1.0464, "step": 1854 }, { "epoch": 0.7716506772742636, "grad_norm": 1.6210649013519287, "learning_rate": 1.7377648043233303e-05, "loss": 0.9497, "step": 1855 }, { "epoch": 0.7720666614668643, "grad_norm": 1.5826036930084229, "learning_rate": 1.73746228351507e-05, "loss": 0.9395, "step": 1856 }, { "epoch": 0.7724826456594649, "grad_norm": 7.759859561920166, "learning_rate": 1.737159614674766e-05, "loss": 0.8902, "step": 1857 }, { "epoch": 0.7728986298520656, "grad_norm": 1.424926996231079, "learning_rate": 1.7368567978631724e-05, "loss": 0.9345, "step": 1858 }, { "epoch": 0.7733146140446663, "grad_norm": 7.234941005706787, "learning_rate": 1.7365538331410747e-05, "loss": 0.8949, "step": 1859 }, { "epoch": 0.773730598237267, "grad_norm": 1.4302759170532227, "learning_rate": 1.7362507205692872e-05, "loss": 0.9051, "step": 1860 }, { "epoch": 0.7741465824298677, "grad_norm": 1.3765621185302734, "learning_rate": 1.7359474602086544e-05, "loss": 0.7249, "step": 1861 }, { "epoch": 0.7745625666224684, "grad_norm": 1.6272048950195312, "learning_rate": 1.7356440521200502e-05, "loss": 0.8307, "step": 1862 }, { "epoch": 0.7749785508150691, "grad_norm": 1.5169296264648438, "learning_rate": 1.735340496364378e-05, "loss": 0.8168, "step": 1863 }, { "epoch": 0.7753945350076697, "grad_norm": 1.4673194885253906, "learning_rate": 1.7350367930025715e-05, "loss": 0.8892, "step": 1864 }, { "epoch": 0.7758105192002704, "grad_norm": 1.431227684020996, "learning_rate": 1.7347329420955935e-05, "loss": 0.7804, "step": 1865 }, { "epoch": 0.776226503392871, "grad_norm": 1.4031354188919067, "learning_rate": 1.7344289437044358e-05, "loss": 0.8741, "step": 1866 }, { "epoch": 0.7766424875854717, "grad_norm": 1.6002986431121826, "learning_rate": 1.7341247978901216e-05, "loss": 0.9722, "step": 1867 }, { "epoch": 0.7770584717780724, "grad_norm": 1.4203369617462158, "learning_rate": 1.733820504713702e-05, "loss": 0.9165, "step": 1868 }, { "epoch": 0.7774744559706731, "grad_norm": 1.4480018615722656, "learning_rate": 1.7335160642362584e-05, "loss": 0.9139, "step": 1869 }, { "epoch": 0.7778904401632738, "grad_norm": 1.493922472000122, "learning_rate": 1.7332114765189013e-05, "loss": 0.9845, "step": 1870 }, { "epoch": 0.7783064243558745, "grad_norm": 1.4016940593719482, "learning_rate": 1.7329067416227717e-05, "loss": 0.9247, "step": 1871 }, { "epoch": 0.7787224085484752, "grad_norm": 1.32820725440979, "learning_rate": 1.732601859609039e-05, "loss": 0.7373, "step": 1872 }, { "epoch": 0.7791383927410759, "grad_norm": 1.4538307189941406, "learning_rate": 1.732296830538903e-05, "loss": 0.976, "step": 1873 }, { "epoch": 0.7795543769336766, "grad_norm": 1.676714539527893, "learning_rate": 1.731991654473593e-05, "loss": 0.9179, "step": 1874 }, { "epoch": 0.7799703611262772, "grad_norm": 1.4986591339111328, "learning_rate": 1.7316863314743666e-05, "loss": 0.8992, "step": 1875 }, { "epoch": 0.7803863453188778, "grad_norm": 1.5629459619522095, "learning_rate": 1.7313808616025126e-05, "loss": 0.8447, "step": 1876 }, { "epoch": 0.7808023295114785, "grad_norm": 1.9657471179962158, "learning_rate": 1.7310752449193484e-05, "loss": 1.1093, "step": 1877 }, { "epoch": 0.7812183137040792, "grad_norm": 1.4979051351547241, "learning_rate": 1.7307694814862203e-05, "loss": 0.9125, "step": 1878 }, { "epoch": 0.7816342978966799, "grad_norm": 1.520846962928772, "learning_rate": 1.7304635713645054e-05, "loss": 0.8302, "step": 1879 }, { "epoch": 0.7820502820892806, "grad_norm": 1.437089443206787, "learning_rate": 1.7301575146156096e-05, "loss": 0.8763, "step": 1880 }, { "epoch": 0.7824662662818813, "grad_norm": 1.4300447702407837, "learning_rate": 1.7298513113009675e-05, "loss": 0.8244, "step": 1881 }, { "epoch": 0.782882250474482, "grad_norm": 1.4943194389343262, "learning_rate": 1.7295449614820445e-05, "loss": 0.8693, "step": 1882 }, { "epoch": 0.7832982346670827, "grad_norm": 1.4937654733657837, "learning_rate": 1.7292384652203342e-05, "loss": 0.8754, "step": 1883 }, { "epoch": 0.7837142188596833, "grad_norm": 1.4841773509979248, "learning_rate": 1.7289318225773603e-05, "loss": 0.8624, "step": 1884 }, { "epoch": 0.784130203052284, "grad_norm": 1.325972318649292, "learning_rate": 1.728625033614676e-05, "loss": 0.8031, "step": 1885 }, { "epoch": 0.7845461872448847, "grad_norm": 1.6133582592010498, "learning_rate": 1.728318098393863e-05, "loss": 0.9955, "step": 1886 }, { "epoch": 0.7849621714374854, "grad_norm": 1.4519124031066895, "learning_rate": 1.728011016976534e-05, "loss": 0.8612, "step": 1887 }, { "epoch": 0.7853781556300861, "grad_norm": 1.4661829471588135, "learning_rate": 1.7277037894243283e-05, "loss": 0.8404, "step": 1888 }, { "epoch": 0.7857941398226868, "grad_norm": 1.4067966938018799, "learning_rate": 1.7273964157989174e-05, "loss": 0.8446, "step": 1889 }, { "epoch": 0.7862101240152874, "grad_norm": 1.49893057346344, "learning_rate": 1.7270888961620006e-05, "loss": 0.9134, "step": 1890 }, { "epoch": 0.7866261082078881, "grad_norm": 21.540922164916992, "learning_rate": 1.726781230575307e-05, "loss": 0.9954, "step": 1891 }, { "epoch": 0.7870420924004888, "grad_norm": 2.025999069213867, "learning_rate": 1.7264734191005947e-05, "loss": 0.9456, "step": 1892 }, { "epoch": 0.7874580765930894, "grad_norm": 1.5138031244277954, "learning_rate": 1.726165461799651e-05, "loss": 0.9082, "step": 1893 }, { "epoch": 0.7878740607856901, "grad_norm": 1.4947656393051147, "learning_rate": 1.725857358734293e-05, "loss": 1.0491, "step": 1894 }, { "epoch": 0.7882900449782908, "grad_norm": 21.686803817749023, "learning_rate": 1.725549109966367e-05, "loss": 0.8257, "step": 1895 }, { "epoch": 0.7887060291708915, "grad_norm": 1.3753705024719238, "learning_rate": 1.7252407155577472e-05, "loss": 0.8938, "step": 1896 }, { "epoch": 0.7891220133634922, "grad_norm": 1.5911654233932495, "learning_rate": 1.7249321755703394e-05, "loss": 0.884, "step": 1897 }, { "epoch": 0.7895379975560929, "grad_norm": 1.5317134857177734, "learning_rate": 1.7246234900660765e-05, "loss": 0.9283, "step": 1898 }, { "epoch": 0.7899539817486936, "grad_norm": 1.538807988166809, "learning_rate": 1.7243146591069216e-05, "loss": 0.9097, "step": 1899 }, { "epoch": 0.7903699659412943, "grad_norm": 1.4094719886779785, "learning_rate": 1.7240056827548676e-05, "loss": 0.7812, "step": 1900 }, { "epoch": 0.790785950133895, "grad_norm": 1.621131420135498, "learning_rate": 1.7236965610719346e-05, "loss": 0.9708, "step": 1901 }, { "epoch": 0.7912019343264955, "grad_norm": 1.5534456968307495, "learning_rate": 1.7233872941201745e-05, "loss": 0.9706, "step": 1902 }, { "epoch": 0.7916179185190962, "grad_norm": 1.4167654514312744, "learning_rate": 1.723077881961666e-05, "loss": 0.8915, "step": 1903 }, { "epoch": 0.7920339027116969, "grad_norm": 1.4602535963058472, "learning_rate": 1.722768324658518e-05, "loss": 0.88, "step": 1904 }, { "epoch": 0.7924498869042976, "grad_norm": 1.6566075086593628, "learning_rate": 1.722458622272869e-05, "loss": 0.8027, "step": 1905 }, { "epoch": 0.7928658710968983, "grad_norm": 1.5734806060791016, "learning_rate": 1.7221487748668853e-05, "loss": 0.972, "step": 1906 }, { "epoch": 0.793281855289499, "grad_norm": 1.443605661392212, "learning_rate": 1.7218387825027637e-05, "loss": 0.8488, "step": 1907 }, { "epoch": 0.7936978394820997, "grad_norm": 1.5650112628936768, "learning_rate": 1.7215286452427293e-05, "loss": 0.9522, "step": 1908 }, { "epoch": 0.7941138236747004, "grad_norm": 1.4052625894546509, "learning_rate": 1.721218363149036e-05, "loss": 0.8719, "step": 1909 }, { "epoch": 0.7945298078673011, "grad_norm": 1.466860294342041, "learning_rate": 1.7209079362839685e-05, "loss": 0.9452, "step": 1910 }, { "epoch": 0.7949457920599017, "grad_norm": 1.399592638015747, "learning_rate": 1.7205973647098383e-05, "loss": 0.7956, "step": 1911 }, { "epoch": 0.7953617762525024, "grad_norm": 1.635161280632019, "learning_rate": 1.720286648488987e-05, "loss": 0.98, "step": 1912 }, { "epoch": 0.7957777604451031, "grad_norm": 1.385312557220459, "learning_rate": 1.7199757876837855e-05, "loss": 0.9344, "step": 1913 }, { "epoch": 0.7961937446377038, "grad_norm": 1.5486372709274292, "learning_rate": 1.7196647823566332e-05, "loss": 0.8274, "step": 1914 }, { "epoch": 0.7966097288303045, "grad_norm": 1.5486924648284912, "learning_rate": 1.719353632569959e-05, "loss": 0.9956, "step": 1915 }, { "epoch": 0.7970257130229051, "grad_norm": 1.6778439283370972, "learning_rate": 1.71904233838622e-05, "loss": 0.9009, "step": 1916 }, { "epoch": 0.7974416972155058, "grad_norm": 1.5862053632736206, "learning_rate": 1.7187308998679034e-05, "loss": 0.9236, "step": 1917 }, { "epoch": 0.7978576814081065, "grad_norm": 1.575174331665039, "learning_rate": 1.7184193170775244e-05, "loss": 1.0406, "step": 1918 }, { "epoch": 0.7982736656007072, "grad_norm": 1.6029878854751587, "learning_rate": 1.718107590077628e-05, "loss": 0.9587, "step": 1919 }, { "epoch": 0.7986896497933078, "grad_norm": 1.3748031854629517, "learning_rate": 1.717795718930787e-05, "loss": 0.8145, "step": 1920 }, { "epoch": 0.7991056339859085, "grad_norm": 1.531101107597351, "learning_rate": 1.717483703699604e-05, "loss": 0.9114, "step": 1921 }, { "epoch": 0.7995216181785092, "grad_norm": 1.4682257175445557, "learning_rate": 1.7171715444467106e-05, "loss": 0.8741, "step": 1922 }, { "epoch": 0.7999376023711099, "grad_norm": 17.207897186279297, "learning_rate": 1.716859241234767e-05, "loss": 0.9346, "step": 1923 }, { "epoch": 0.8003535865637106, "grad_norm": 1.4486056566238403, "learning_rate": 1.716546794126462e-05, "loss": 0.7894, "step": 1924 }, { "epoch": 0.8007695707563113, "grad_norm": 1.5702770948410034, "learning_rate": 1.7162342031845144e-05, "loss": 0.8731, "step": 1925 }, { "epoch": 0.801185554948912, "grad_norm": 248.94271850585938, "learning_rate": 1.7159214684716706e-05, "loss": 0.7572, "step": 1926 }, { "epoch": 0.8016015391415127, "grad_norm": 1.4508845806121826, "learning_rate": 1.715608590050706e-05, "loss": 0.8284, "step": 1927 }, { "epoch": 0.8020175233341134, "grad_norm": 1.5757449865341187, "learning_rate": 1.715295567984426e-05, "loss": 1.0003, "step": 1928 }, { "epoch": 0.8024335075267139, "grad_norm": 1.561690092086792, "learning_rate": 1.7149824023356637e-05, "loss": 0.8664, "step": 1929 }, { "epoch": 0.8028494917193146, "grad_norm": 1.5475492477416992, "learning_rate": 1.714669093167281e-05, "loss": 0.9327, "step": 1930 }, { "epoch": 0.8032654759119153, "grad_norm": 1.3828322887420654, "learning_rate": 1.7143556405421694e-05, "loss": 0.8716, "step": 1931 }, { "epoch": 0.803681460104516, "grad_norm": 1.3521815538406372, "learning_rate": 1.7140420445232488e-05, "loss": 0.797, "step": 1932 }, { "epoch": 0.8040974442971167, "grad_norm": 1.6670540571212769, "learning_rate": 1.7137283051734678e-05, "loss": 0.9444, "step": 1933 }, { "epoch": 0.8045134284897174, "grad_norm": 1.5191099643707275, "learning_rate": 1.7134144225558036e-05, "loss": 0.9749, "step": 1934 }, { "epoch": 0.8049294126823181, "grad_norm": 1.335014820098877, "learning_rate": 1.7131003967332623e-05, "loss": 0.9015, "step": 1935 }, { "epoch": 0.8053453968749188, "grad_norm": 1.4985734224319458, "learning_rate": 1.7127862277688793e-05, "loss": 0.852, "step": 1936 }, { "epoch": 0.8057613810675195, "grad_norm": 1.474022388458252, "learning_rate": 1.712471915725718e-05, "loss": 0.8833, "step": 1937 }, { "epoch": 0.8061773652601201, "grad_norm": 1.4523154497146606, "learning_rate": 1.7121574606668708e-05, "loss": 0.8349, "step": 1938 }, { "epoch": 0.8065933494527208, "grad_norm": 1.4327641725540161, "learning_rate": 1.711842862655459e-05, "loss": 0.8432, "step": 1939 }, { "epoch": 0.8070093336453215, "grad_norm": 1.470369815826416, "learning_rate": 1.711528121754632e-05, "loss": 0.8058, "step": 1940 }, { "epoch": 0.8074253178379222, "grad_norm": 1.4616925716400146, "learning_rate": 1.7112132380275688e-05, "loss": 0.8806, "step": 1941 }, { "epoch": 0.8078413020305228, "grad_norm": 1.5584441423416138, "learning_rate": 1.710898211537476e-05, "loss": 0.9801, "step": 1942 }, { "epoch": 0.8082572862231235, "grad_norm": 1.4027948379516602, "learning_rate": 1.7105830423475894e-05, "loss": 0.9941, "step": 1943 }, { "epoch": 0.8086732704157242, "grad_norm": 1.5219323635101318, "learning_rate": 1.710267730521174e-05, "loss": 0.9073, "step": 1944 }, { "epoch": 0.8090892546083249, "grad_norm": 39.779422760009766, "learning_rate": 1.7099522761215223e-05, "loss": 1.0161, "step": 1945 }, { "epoch": 0.8095052388009256, "grad_norm": 1.6682727336883545, "learning_rate": 1.7096366792119562e-05, "loss": 1.0241, "step": 1946 }, { "epoch": 0.8099212229935262, "grad_norm": 1.4567272663116455, "learning_rate": 1.709320939855826e-05, "loss": 0.7906, "step": 1947 }, { "epoch": 0.8103372071861269, "grad_norm": 1.6091563701629639, "learning_rate": 1.7090050581165108e-05, "loss": 0.9414, "step": 1948 }, { "epoch": 0.8107531913787276, "grad_norm": 1.5241812467575073, "learning_rate": 1.708689034057418e-05, "loss": 0.8978, "step": 1949 }, { "epoch": 0.8111691755713283, "grad_norm": 1.4977035522460938, "learning_rate": 1.708372867741983e-05, "loss": 0.8772, "step": 1950 }, { "epoch": 0.811585159763929, "grad_norm": 1.541028380393982, "learning_rate": 1.7080565592336714e-05, "loss": 0.9849, "step": 1951 }, { "epoch": 0.8120011439565297, "grad_norm": 1.5081710815429688, "learning_rate": 1.7077401085959756e-05, "loss": 0.8331, "step": 1952 }, { "epoch": 0.8124171281491304, "grad_norm": 1.4335042238235474, "learning_rate": 1.7074235158924176e-05, "loss": 0.7379, "step": 1953 }, { "epoch": 0.8128331123417311, "grad_norm": 1.5767242908477783, "learning_rate": 1.7071067811865477e-05, "loss": 0.8845, "step": 1954 }, { "epoch": 0.8132490965343318, "grad_norm": 1.451156735420227, "learning_rate": 1.7067899045419442e-05, "loss": 0.9094, "step": 1955 }, { "epoch": 0.8136650807269323, "grad_norm": 1.4401273727416992, "learning_rate": 1.7064728860222147e-05, "loss": 0.8452, "step": 1956 }, { "epoch": 0.814081064919533, "grad_norm": 1.7049559354782104, "learning_rate": 1.7061557256909943e-05, "loss": 0.9661, "step": 1957 }, { "epoch": 0.8144970491121337, "grad_norm": 1.6375006437301636, "learning_rate": 1.705838423611948e-05, "loss": 0.9174, "step": 1958 }, { "epoch": 0.8149130333047344, "grad_norm": 1.6105457544326782, "learning_rate": 1.705520979848767e-05, "loss": 0.9055, "step": 1959 }, { "epoch": 0.8153290174973351, "grad_norm": 2.2199506759643555, "learning_rate": 1.705203394465174e-05, "loss": 0.814, "step": 1960 }, { "epoch": 0.8157450016899358, "grad_norm": 1.4243991374969482, "learning_rate": 1.704885667524917e-05, "loss": 0.7616, "step": 1961 }, { "epoch": 0.8161609858825365, "grad_norm": 1.5150502920150757, "learning_rate": 1.7045677990917745e-05, "loss": 0.9024, "step": 1962 }, { "epoch": 0.8165769700751372, "grad_norm": 1.5837279558181763, "learning_rate": 1.7042497892295532e-05, "loss": 0.9932, "step": 1963 }, { "epoch": 0.8169929542677379, "grad_norm": 1.4727922677993774, "learning_rate": 1.7039316380020873e-05, "loss": 0.867, "step": 1964 }, { "epoch": 0.8174089384603385, "grad_norm": 1.5218515396118164, "learning_rate": 1.7036133454732394e-05, "loss": 0.9221, "step": 1965 }, { "epoch": 0.8178249226529392, "grad_norm": 1.6496292352676392, "learning_rate": 1.7032949117069018e-05, "loss": 0.8877, "step": 1966 }, { "epoch": 0.8182409068455399, "grad_norm": 1.4895879030227661, "learning_rate": 1.7029763367669934e-05, "loss": 0.8742, "step": 1967 }, { "epoch": 0.8186568910381405, "grad_norm": 1.52810800075531, "learning_rate": 1.7026576207174628e-05, "loss": 0.8663, "step": 1968 }, { "epoch": 0.8190728752307412, "grad_norm": 11.159671783447266, "learning_rate": 1.702338763622286e-05, "loss": 0.9623, "step": 1969 }, { "epoch": 0.8194888594233419, "grad_norm": 1.6406031847000122, "learning_rate": 1.7020197655454683e-05, "loss": 0.9065, "step": 1970 }, { "epoch": 0.8199048436159426, "grad_norm": 6649.18896484375, "learning_rate": 1.7017006265510425e-05, "loss": 0.8832, "step": 1971 }, { "epoch": 0.8203208278085433, "grad_norm": 1.675304651260376, "learning_rate": 1.7013813467030696e-05, "loss": 0.9051, "step": 1972 }, { "epoch": 0.820736812001144, "grad_norm": 1.505752682685852, "learning_rate": 1.7010619260656397e-05, "loss": 0.8683, "step": 1973 }, { "epoch": 0.8211527961937446, "grad_norm": 1.6246732473373413, "learning_rate": 1.70074236470287e-05, "loss": 0.9187, "step": 1974 }, { "epoch": 0.8215687803863453, "grad_norm": 1.4471116065979004, "learning_rate": 1.700422662678907e-05, "loss": 0.8996, "step": 1975 }, { "epoch": 0.821984764578946, "grad_norm": 8.592826843261719, "learning_rate": 1.700102820057925e-05, "loss": 0.889, "step": 1976 }, { "epoch": 0.8224007487715467, "grad_norm": 1.317482352256775, "learning_rate": 1.6997828369041266e-05, "loss": 0.6771, "step": 1977 }, { "epoch": 0.8228167329641474, "grad_norm": 1.567434549331665, "learning_rate": 1.699462713281742e-05, "loss": 0.9089, "step": 1978 }, { "epoch": 0.8232327171567481, "grad_norm": 1.5294548273086548, "learning_rate": 1.699142449255031e-05, "loss": 0.9248, "step": 1979 }, { "epoch": 0.8236487013493488, "grad_norm": 1.6443114280700684, "learning_rate": 1.6988220448882806e-05, "loss": 0.9584, "step": 1980 }, { "epoch": 0.8240646855419494, "grad_norm": 1.6321595907211304, "learning_rate": 1.698501500245805e-05, "loss": 1.0546, "step": 1981 }, { "epoch": 0.8244806697345501, "grad_norm": 1.3640882968902588, "learning_rate": 1.6981808153919496e-05, "loss": 0.8982, "step": 1982 }, { "epoch": 0.8248966539271507, "grad_norm": 1.4965944290161133, "learning_rate": 1.697859990391084e-05, "loss": 0.8158, "step": 1983 }, { "epoch": 0.8253126381197514, "grad_norm": 1.505684733390808, "learning_rate": 1.6975390253076093e-05, "loss": 0.8559, "step": 1984 }, { "epoch": 0.8257286223123521, "grad_norm": 1.5006917715072632, "learning_rate": 1.6972179202059532e-05, "loss": 0.9261, "step": 1985 }, { "epoch": 0.8261446065049528, "grad_norm": 4.856268405914307, "learning_rate": 1.696896675150571e-05, "loss": 0.862, "step": 1986 }, { "epoch": 0.8265605906975535, "grad_norm": 1.5233510732650757, "learning_rate": 1.6965752902059472e-05, "loss": 0.8999, "step": 1987 }, { "epoch": 0.8269765748901542, "grad_norm": 3977.71875, "learning_rate": 1.696253765436594e-05, "loss": 0.9763, "step": 1988 }, { "epoch": 0.8273925590827549, "grad_norm": 1.5720261335372925, "learning_rate": 1.6959321009070513e-05, "loss": 0.9561, "step": 1989 }, { "epoch": 0.8278085432753556, "grad_norm": 33.386932373046875, "learning_rate": 1.6956102966818877e-05, "loss": 0.8512, "step": 1990 }, { "epoch": 0.8282245274679563, "grad_norm": 1.4342174530029297, "learning_rate": 1.6952883528256995e-05, "loss": 0.794, "step": 1991 }, { "epoch": 0.8286405116605569, "grad_norm": 1.4770219326019287, "learning_rate": 1.6949662694031107e-05, "loss": 0.8127, "step": 1992 }, { "epoch": 0.8290564958531575, "grad_norm": 1.4772542715072632, "learning_rate": 1.6946440464787738e-05, "loss": 0.9228, "step": 1993 }, { "epoch": 0.8294724800457582, "grad_norm": 1.577268123626709, "learning_rate": 1.6943216841173693e-05, "loss": 0.9546, "step": 1994 }, { "epoch": 0.8298884642383589, "grad_norm": 1.99313223361969, "learning_rate": 1.6939991823836055e-05, "loss": 1.0172, "step": 1995 }, { "epoch": 0.8303044484309596, "grad_norm": 1.5835621356964111, "learning_rate": 1.6936765413422187e-05, "loss": 0.9066, "step": 1996 }, { "epoch": 0.8307204326235603, "grad_norm": 1.494693398475647, "learning_rate": 1.693353761057973e-05, "loss": 0.9454, "step": 1997 }, { "epoch": 0.831136416816161, "grad_norm": 1.5390666723251343, "learning_rate": 1.6930308415956608e-05, "loss": 0.7385, "step": 1998 }, { "epoch": 0.8315524010087617, "grad_norm": 1.581026315689087, "learning_rate": 1.6927077830201023e-05, "loss": 0.8306, "step": 1999 }, { "epoch": 0.8319683852013624, "grad_norm": 1.4194647073745728, "learning_rate": 1.6923845853961455e-05, "loss": 0.8911, "step": 2000 }, { "epoch": 0.8319683852013624, "eval_loss": 0.8165808916091919, "eval_runtime": 1843.7584, "eval_samples_per_second": 3.575, "eval_steps_per_second": 1.788, "step": 2000 }, { "epoch": 0.832384369393963, "grad_norm": 1.5326238870620728, "learning_rate": 1.6920612487886666e-05, "loss": 0.8529, "step": 2001 }, { "epoch": 0.8328003535865637, "grad_norm": 1.581655502319336, "learning_rate": 1.691737773262569e-05, "loss": 0.9725, "step": 2002 }, { "epoch": 0.8332163377791644, "grad_norm": 1.4934488534927368, "learning_rate": 1.691414158882785e-05, "loss": 0.879, "step": 2003 }, { "epoch": 0.8336323219717651, "grad_norm": 1.6215416193008423, "learning_rate": 1.691090405714274e-05, "loss": 0.9667, "step": 2004 }, { "epoch": 0.8340483061643658, "grad_norm": 1.6121474504470825, "learning_rate": 1.6907665138220234e-05, "loss": 0.8391, "step": 2005 }, { "epoch": 0.8344642903569665, "grad_norm": 1.4677895307540894, "learning_rate": 1.690442483271049e-05, "loss": 0.8394, "step": 2006 }, { "epoch": 0.8348802745495671, "grad_norm": 1.5443267822265625, "learning_rate": 1.6901183141263934e-05, "loss": 0.9166, "step": 2007 }, { "epoch": 0.8352962587421678, "grad_norm": 1.5472077131271362, "learning_rate": 1.6897940064531285e-05, "loss": 0.8506, "step": 2008 }, { "epoch": 0.8357122429347684, "grad_norm": 1.5326614379882812, "learning_rate": 1.689469560316352e-05, "loss": 0.9069, "step": 2009 }, { "epoch": 0.8361282271273691, "grad_norm": 1.574164867401123, "learning_rate": 1.6891449757811917e-05, "loss": 0.9674, "step": 2010 }, { "epoch": 0.8365442113199698, "grad_norm": 1.5062367916107178, "learning_rate": 1.6888202529128012e-05, "loss": 0.8704, "step": 2011 }, { "epoch": 0.8369601955125705, "grad_norm": 634.3465576171875, "learning_rate": 1.6884953917763627e-05, "loss": 1.0199, "step": 2012 }, { "epoch": 0.8373761797051712, "grad_norm": 351.34735107421875, "learning_rate": 1.6881703924370863e-05, "loss": 0.8927, "step": 2013 }, { "epoch": 0.8377921638977719, "grad_norm": 1.5500701665878296, "learning_rate": 1.68784525496021e-05, "loss": 0.9261, "step": 2014 }, { "epoch": 0.8382081480903726, "grad_norm": 1.418350338935852, "learning_rate": 1.6875199794109988e-05, "loss": 0.8691, "step": 2015 }, { "epoch": 0.8386241322829733, "grad_norm": 1.4383556842803955, "learning_rate": 1.6871945658547456e-05, "loss": 0.9078, "step": 2016 }, { "epoch": 0.839040116475574, "grad_norm": 2.1833817958831787, "learning_rate": 1.6868690143567717e-05, "loss": 0.9195, "step": 2017 }, { "epoch": 0.8394561006681746, "grad_norm": 1.3997349739074707, "learning_rate": 1.686543324982425e-05, "loss": 0.8897, "step": 2018 }, { "epoch": 0.8398720848607752, "grad_norm": 1.7388700246810913, "learning_rate": 1.6862174977970825e-05, "loss": 0.7932, "step": 2019 }, { "epoch": 0.8402880690533759, "grad_norm": 1.6192023754119873, "learning_rate": 1.6858915328661475e-05, "loss": 0.9402, "step": 2020 }, { "epoch": 0.8407040532459766, "grad_norm": 1.4289982318878174, "learning_rate": 1.6855654302550512e-05, "loss": 0.842, "step": 2021 }, { "epoch": 0.8411200374385773, "grad_norm": 6.414224147796631, "learning_rate": 1.6852391900292536e-05, "loss": 1.0426, "step": 2022 }, { "epoch": 0.841536021631178, "grad_norm": 1.6936900615692139, "learning_rate": 1.6849128122542406e-05, "loss": 0.9662, "step": 2023 }, { "epoch": 0.8419520058237787, "grad_norm": 1.6680830717086792, "learning_rate": 1.684586296995527e-05, "loss": 0.9677, "step": 2024 }, { "epoch": 0.8423679900163794, "grad_norm": 1.7168493270874023, "learning_rate": 1.6842596443186547e-05, "loss": 0.7539, "step": 2025 }, { "epoch": 0.8427839742089801, "grad_norm": 1.520333170890808, "learning_rate": 1.6839328542891934e-05, "loss": 0.7771, "step": 2026 }, { "epoch": 0.8431999584015807, "grad_norm": 1.556056022644043, "learning_rate": 1.6836059269727397e-05, "loss": 0.7955, "step": 2027 }, { "epoch": 0.8436159425941814, "grad_norm": 1.5275055170059204, "learning_rate": 1.6832788624349188e-05, "loss": 0.7508, "step": 2028 }, { "epoch": 0.8440319267867821, "grad_norm": 1.5642342567443848, "learning_rate": 1.6829516607413824e-05, "loss": 0.8588, "step": 2029 }, { "epoch": 0.8444479109793828, "grad_norm": 1.421180009841919, "learning_rate": 1.682624321957811e-05, "loss": 0.845, "step": 2030 }, { "epoch": 0.8448638951719835, "grad_norm": 1.635593056678772, "learning_rate": 1.6822968461499112e-05, "loss": 0.8695, "step": 2031 }, { "epoch": 0.8452798793645842, "grad_norm": 1.6657845973968506, "learning_rate": 1.681969233383418e-05, "loss": 0.9985, "step": 2032 }, { "epoch": 0.8456958635571848, "grad_norm": 1.4534555673599243, "learning_rate": 1.6816414837240937e-05, "loss": 0.9363, "step": 2033 }, { "epoch": 0.8461118477497855, "grad_norm": 1.550568699836731, "learning_rate": 1.6813135972377278e-05, "loss": 0.9172, "step": 2034 }, { "epoch": 0.8465278319423862, "grad_norm": 1.462120532989502, "learning_rate": 1.680985573990138e-05, "loss": 0.9851, "step": 2035 }, { "epoch": 0.8469438161349868, "grad_norm": 1.4837359189987183, "learning_rate": 1.6806574140471685e-05, "loss": 0.8756, "step": 2036 }, { "epoch": 0.8473598003275875, "grad_norm": 1.5369192361831665, "learning_rate": 1.6803291174746913e-05, "loss": 0.8784, "step": 2037 }, { "epoch": 0.8477757845201882, "grad_norm": 1.8044472932815552, "learning_rate": 1.6800006843386062e-05, "loss": 0.9193, "step": 2038 }, { "epoch": 0.8481917687127889, "grad_norm": 1.5954601764678955, "learning_rate": 1.6796721147048405e-05, "loss": 0.958, "step": 2039 }, { "epoch": 0.8486077529053896, "grad_norm": 1.4968820810317993, "learning_rate": 1.679343408639348e-05, "loss": 0.8124, "step": 2040 }, { "epoch": 0.8490237370979903, "grad_norm": 1.5393061637878418, "learning_rate": 1.6790145662081104e-05, "loss": 0.8716, "step": 2041 }, { "epoch": 0.849439721290591, "grad_norm": 1.881392240524292, "learning_rate": 1.678685587477137e-05, "loss": 0.8413, "step": 2042 }, { "epoch": 0.8498557054831917, "grad_norm": 1.4574222564697266, "learning_rate": 1.6783564725124643e-05, "loss": 0.7768, "step": 2043 }, { "epoch": 0.8502716896757924, "grad_norm": 1.5757840871810913, "learning_rate": 1.6780272213801564e-05, "loss": 0.8738, "step": 2044 }, { "epoch": 0.850687673868393, "grad_norm": 5.916356086730957, "learning_rate": 1.6776978341463035e-05, "loss": 1.0271, "step": 2045 }, { "epoch": 0.8511036580609936, "grad_norm": 1.8420617580413818, "learning_rate": 1.6773683108770247e-05, "loss": 0.8857, "step": 2046 }, { "epoch": 0.8515196422535943, "grad_norm": 1.5861064195632935, "learning_rate": 1.6770386516384657e-05, "loss": 1.0653, "step": 2047 }, { "epoch": 0.851935626446195, "grad_norm": 1.5188353061676025, "learning_rate": 1.6767088564967996e-05, "loss": 0.8661, "step": 2048 }, { "epoch": 0.8523516106387957, "grad_norm": 1.6085654497146606, "learning_rate": 1.6763789255182262e-05, "loss": 0.9173, "step": 2049 }, { "epoch": 0.8527675948313964, "grad_norm": 1.6588011980056763, "learning_rate": 1.676048858768974e-05, "loss": 0.9005, "step": 2050 }, { "epoch": 0.8531835790239971, "grad_norm": 1.5267165899276733, "learning_rate": 1.6757186563152977e-05, "loss": 0.9207, "step": 2051 }, { "epoch": 0.8535995632165978, "grad_norm": 1.5601705312728882, "learning_rate": 1.6753883182234787e-05, "loss": 0.8537, "step": 2052 }, { "epoch": 0.8540155474091985, "grad_norm": 1.6191911697387695, "learning_rate": 1.6750578445598265e-05, "loss": 0.9646, "step": 2053 }, { "epoch": 0.8544315316017991, "grad_norm": 1.5675212144851685, "learning_rate": 1.674727235390678e-05, "loss": 0.8805, "step": 2054 }, { "epoch": 0.8548475157943998, "grad_norm": 1.5070805549621582, "learning_rate": 1.674396490782397e-05, "loss": 0.7886, "step": 2055 }, { "epoch": 0.8552634999870005, "grad_norm": 1.4037668704986572, "learning_rate": 1.6740656108013743e-05, "loss": 0.8243, "step": 2056 }, { "epoch": 0.8556794841796012, "grad_norm": 1.5846819877624512, "learning_rate": 1.6737345955140277e-05, "loss": 0.9337, "step": 2057 }, { "epoch": 0.8560954683722019, "grad_norm": 1.5896806716918945, "learning_rate": 1.6734034449868023e-05, "loss": 0.9059, "step": 2058 }, { "epoch": 0.8565114525648025, "grad_norm": 1.6200850009918213, "learning_rate": 1.6730721592861714e-05, "loss": 0.9132, "step": 2059 }, { "epoch": 0.8569274367574032, "grad_norm": 1.5756016969680786, "learning_rate": 1.6727407384786337e-05, "loss": 1.011, "step": 2060 }, { "epoch": 0.8573434209500039, "grad_norm": 1.4192752838134766, "learning_rate": 1.6724091826307162e-05, "loss": 0.9641, "step": 2061 }, { "epoch": 0.8577594051426046, "grad_norm": 1.5375645160675049, "learning_rate": 1.6720774918089728e-05, "loss": 0.8338, "step": 2062 }, { "epoch": 0.8581753893352052, "grad_norm": 1.418548583984375, "learning_rate": 1.6717456660799842e-05, "loss": 0.8932, "step": 2063 }, { "epoch": 0.8585913735278059, "grad_norm": 1.4990166425704956, "learning_rate": 1.671413705510358e-05, "loss": 0.7965, "step": 2064 }, { "epoch": 0.8590073577204066, "grad_norm": 1.537630319595337, "learning_rate": 1.6710816101667298e-05, "loss": 0.7814, "step": 2065 }, { "epoch": 0.8594233419130073, "grad_norm": 1.4861730337142944, "learning_rate": 1.6707493801157612e-05, "loss": 0.8572, "step": 2066 }, { "epoch": 0.859839326105608, "grad_norm": 1.63003408908844, "learning_rate": 1.6704170154241413e-05, "loss": 0.892, "step": 2067 }, { "epoch": 0.8602553102982087, "grad_norm": 1.4280873537063599, "learning_rate": 1.6700845161585867e-05, "loss": 0.9523, "step": 2068 }, { "epoch": 0.8606712944908094, "grad_norm": 25.207435607910156, "learning_rate": 1.66975188238584e-05, "loss": 0.8666, "step": 2069 }, { "epoch": 0.8610872786834101, "grad_norm": 1.600455403327942, "learning_rate": 1.6694191141726714e-05, "loss": 0.8362, "step": 2070 }, { "epoch": 0.8615032628760108, "grad_norm": 5.41581392288208, "learning_rate": 1.669086211585879e-05, "loss": 0.9583, "step": 2071 }, { "epoch": 0.8619192470686113, "grad_norm": 1.5501415729522705, "learning_rate": 1.6687531746922852e-05, "loss": 0.8926, "step": 2072 }, { "epoch": 0.862335231261212, "grad_norm": 1.5803736448287964, "learning_rate": 1.668420003558742e-05, "loss": 0.8751, "step": 2073 }, { "epoch": 0.8627512154538127, "grad_norm": 1.6117701530456543, "learning_rate": 1.668086698252128e-05, "loss": 0.875, "step": 2074 }, { "epoch": 0.8631671996464134, "grad_norm": 1.5530356168746948, "learning_rate": 1.6677532588393468e-05, "loss": 0.9334, "step": 2075 }, { "epoch": 0.8635831838390141, "grad_norm": 35.99604034423828, "learning_rate": 1.6674196853873314e-05, "loss": 0.7617, "step": 2076 }, { "epoch": 0.8639991680316148, "grad_norm": 1.5911489725112915, "learning_rate": 1.6670859779630396e-05, "loss": 0.9116, "step": 2077 }, { "epoch": 0.8644151522242155, "grad_norm": 1.597745656967163, "learning_rate": 1.666752136633458e-05, "loss": 0.9023, "step": 2078 }, { "epoch": 0.8648311364168162, "grad_norm": 1.5455496311187744, "learning_rate": 1.6664181614655984e-05, "loss": 0.9563, "step": 2079 }, { "epoch": 0.8652471206094169, "grad_norm": 1.5417969226837158, "learning_rate": 1.6660840525265006e-05, "loss": 0.9205, "step": 2080 }, { "epoch": 0.8656631048020175, "grad_norm": 1.5631822347640991, "learning_rate": 1.6657498098832308e-05, "loss": 1.0415, "step": 2081 }, { "epoch": 0.8660790889946182, "grad_norm": 1.5701277256011963, "learning_rate": 1.665415433602882e-05, "loss": 0.948, "step": 2082 }, { "epoch": 0.8664950731872189, "grad_norm": 1.601543664932251, "learning_rate": 1.665080923752574e-05, "loss": 0.9022, "step": 2083 }, { "epoch": 0.8669110573798195, "grad_norm": 1.6437900066375732, "learning_rate": 1.6647462803994535e-05, "loss": 0.8879, "step": 2084 }, { "epoch": 0.8673270415724202, "grad_norm": 1.560076117515564, "learning_rate": 1.6644115036106944e-05, "loss": 0.9087, "step": 2085 }, { "epoch": 0.8677430257650209, "grad_norm": 1.4569272994995117, "learning_rate": 1.6640765934534967e-05, "loss": 0.8239, "step": 2086 }, { "epoch": 0.8681590099576216, "grad_norm": 1.6263010501861572, "learning_rate": 1.6637415499950872e-05, "loss": 1.0384, "step": 2087 }, { "epoch": 0.8685749941502223, "grad_norm": 1.612240195274353, "learning_rate": 1.6634063733027204e-05, "loss": 0.9492, "step": 2088 }, { "epoch": 0.868990978342823, "grad_norm": 1.5481373071670532, "learning_rate": 1.663071063443677e-05, "loss": 0.9005, "step": 2089 }, { "epoch": 0.8694069625354236, "grad_norm": 1.8182076215744019, "learning_rate": 1.6627356204852634e-05, "loss": 0.906, "step": 2090 }, { "epoch": 0.8698229467280243, "grad_norm": 1.5672688484191895, "learning_rate": 1.662400044494814e-05, "loss": 0.8673, "step": 2091 }, { "epoch": 0.870238930920625, "grad_norm": 1.4918736219406128, "learning_rate": 1.66206433553969e-05, "loss": 0.8241, "step": 2092 }, { "epoch": 0.8706549151132257, "grad_norm": 1.5464643239974976, "learning_rate": 1.6617284936872782e-05, "loss": 0.89, "step": 2093 }, { "epoch": 0.8710708993058264, "grad_norm": 1.6525861024856567, "learning_rate": 1.6613925190049933e-05, "loss": 0.9368, "step": 2094 }, { "epoch": 0.8714868834984271, "grad_norm": 1.4834895133972168, "learning_rate": 1.6610564115602756e-05, "loss": 0.7861, "step": 2095 }, { "epoch": 0.8719028676910278, "grad_norm": 6.402346611022949, "learning_rate": 1.6607201714205925e-05, "loss": 0.8981, "step": 2096 }, { "epoch": 0.8723188518836285, "grad_norm": 1.503373622894287, "learning_rate": 1.660383798653438e-05, "loss": 0.7805, "step": 2097 }, { "epoch": 0.8727348360762291, "grad_norm": 1929.2039794921875, "learning_rate": 1.6600472933263335e-05, "loss": 0.8239, "step": 2098 }, { "epoch": 0.8731508202688297, "grad_norm": 1.6232843399047852, "learning_rate": 1.6597106555068253e-05, "loss": 0.9883, "step": 2099 }, { "epoch": 0.8735668044614304, "grad_norm": 1.560271978378296, "learning_rate": 1.659373885262488e-05, "loss": 0.9064, "step": 2100 }, { "epoch": 0.8739827886540311, "grad_norm": 1.621713399887085, "learning_rate": 1.6590369826609216e-05, "loss": 0.8868, "step": 2101 }, { "epoch": 0.8743987728466318, "grad_norm": 1.5852829217910767, "learning_rate": 1.6586999477697535e-05, "loss": 0.9688, "step": 2102 }, { "epoch": 0.8748147570392325, "grad_norm": 1.5740257501602173, "learning_rate": 1.6583627806566367e-05, "loss": 0.8642, "step": 2103 }, { "epoch": 0.8752307412318332, "grad_norm": 1.4974952936172485, "learning_rate": 1.658025481389252e-05, "loss": 0.7475, "step": 2104 }, { "epoch": 0.8756467254244339, "grad_norm": 1.5410383939743042, "learning_rate": 1.6576880500353057e-05, "loss": 1.0105, "step": 2105 }, { "epoch": 0.8760627096170346, "grad_norm": 1.4259604215621948, "learning_rate": 1.657350486662531e-05, "loss": 0.7638, "step": 2106 }, { "epoch": 0.8764786938096353, "grad_norm": 1.5991411209106445, "learning_rate": 1.657012791338688e-05, "loss": 0.9244, "step": 2107 }, { "epoch": 0.8768946780022359, "grad_norm": 2.2097246646881104, "learning_rate": 1.656674964131562e-05, "loss": 1.0566, "step": 2108 }, { "epoch": 0.8773106621948366, "grad_norm": 1.4972797632217407, "learning_rate": 1.656337005108966e-05, "loss": 0.9472, "step": 2109 }, { "epoch": 0.8777266463874372, "grad_norm": 1.509124994277954, "learning_rate": 1.655998914338739e-05, "loss": 0.8776, "step": 2110 }, { "epoch": 0.8781426305800379, "grad_norm": 1.408642053604126, "learning_rate": 1.655660691888747e-05, "loss": 0.8143, "step": 2111 }, { "epoch": 0.8785586147726386, "grad_norm": 1.5147969722747803, "learning_rate": 1.6553223378268816e-05, "loss": 0.8773, "step": 2112 }, { "epoch": 0.8789745989652393, "grad_norm": 1.5136743783950806, "learning_rate": 1.654983852221061e-05, "loss": 0.8715, "step": 2113 }, { "epoch": 0.87939058315784, "grad_norm": 1.547041654586792, "learning_rate": 1.6546452351392303e-05, "loss": 0.9186, "step": 2114 }, { "epoch": 0.8798065673504407, "grad_norm": 1.4598801136016846, "learning_rate": 1.6543064866493606e-05, "loss": 0.8457, "step": 2115 }, { "epoch": 0.8802225515430414, "grad_norm": 1.457642674446106, "learning_rate": 1.6539676068194495e-05, "loss": 0.849, "step": 2116 }, { "epoch": 0.880638535735642, "grad_norm": 1.5287079811096191, "learning_rate": 1.6536285957175208e-05, "loss": 0.7613, "step": 2117 }, { "epoch": 0.8810545199282427, "grad_norm": 1.4503499269485474, "learning_rate": 1.6532894534116248e-05, "loss": 0.9128, "step": 2118 }, { "epoch": 0.8814705041208434, "grad_norm": 1.6226381063461304, "learning_rate": 1.652950179969838e-05, "loss": 0.9369, "step": 2119 }, { "epoch": 0.8818864883134441, "grad_norm": 1.6245081424713135, "learning_rate": 1.6526107754602635e-05, "loss": 0.8913, "step": 2120 }, { "epoch": 0.8823024725060448, "grad_norm": 1.5570168495178223, "learning_rate": 1.6522712399510306e-05, "loss": 0.855, "step": 2121 }, { "epoch": 0.8827184566986455, "grad_norm": 1.5707958936691284, "learning_rate": 1.6519315735102948e-05, "loss": 0.8531, "step": 2122 }, { "epoch": 0.8831344408912462, "grad_norm": 1.5048786401748657, "learning_rate": 1.6515917762062375e-05, "loss": 0.8413, "step": 2123 }, { "epoch": 0.8835504250838468, "grad_norm": 1.5914850234985352, "learning_rate": 1.6512518481070673e-05, "loss": 0.8201, "step": 2124 }, { "epoch": 0.8839664092764475, "grad_norm": 1.4824531078338623, "learning_rate": 1.6509117892810186e-05, "loss": 0.9353, "step": 2125 }, { "epoch": 0.8843823934690481, "grad_norm": 1.849518895149231, "learning_rate": 1.6505715997963513e-05, "loss": 0.928, "step": 2126 }, { "epoch": 0.8847983776616488, "grad_norm": 1.5594404935836792, "learning_rate": 1.6502312797213528e-05, "loss": 0.9036, "step": 2127 }, { "epoch": 0.8852143618542495, "grad_norm": 1.6238490343093872, "learning_rate": 1.6498908291243362e-05, "loss": 0.89, "step": 2128 }, { "epoch": 0.8856303460468502, "grad_norm": 1.5721735954284668, "learning_rate": 1.64955024807364e-05, "loss": 0.8363, "step": 2129 }, { "epoch": 0.8860463302394509, "grad_norm": 1.3958579301834106, "learning_rate": 1.6492095366376302e-05, "loss": 0.8896, "step": 2130 }, { "epoch": 0.8864623144320516, "grad_norm": 1.619240403175354, "learning_rate": 1.6488686948846987e-05, "loss": 1.0039, "step": 2131 }, { "epoch": 0.8868782986246523, "grad_norm": 1.6204193830490112, "learning_rate": 1.6485277228832622e-05, "loss": 0.8863, "step": 2132 }, { "epoch": 0.887294282817253, "grad_norm": 1.5426427125930786, "learning_rate": 1.6481866207017657e-05, "loss": 0.8933, "step": 2133 }, { "epoch": 0.8877102670098537, "grad_norm": 1769.8446044921875, "learning_rate": 1.6478453884086786e-05, "loss": 0.8973, "step": 2134 }, { "epoch": 0.8881262512024543, "grad_norm": 1.5223695039749146, "learning_rate": 1.647504026072497e-05, "loss": 0.9248, "step": 2135 }, { "epoch": 0.888542235395055, "grad_norm": 1.6263031959533691, "learning_rate": 1.6471625337617438e-05, "loss": 0.9081, "step": 2136 }, { "epoch": 0.8889582195876556, "grad_norm": 1.6107938289642334, "learning_rate": 1.646820911544966e-05, "loss": 0.8596, "step": 2137 }, { "epoch": 0.8893742037802563, "grad_norm": 1.6955441236495972, "learning_rate": 1.646479159490739e-05, "loss": 0.8937, "step": 2138 }, { "epoch": 0.889790187972857, "grad_norm": 1.6516801118850708, "learning_rate": 1.6461372776676637e-05, "loss": 0.8555, "step": 2139 }, { "epoch": 0.8902061721654577, "grad_norm": 1.4953206777572632, "learning_rate": 1.6457952661443654e-05, "loss": 0.8087, "step": 2140 }, { "epoch": 0.8906221563580584, "grad_norm": 5247.5458984375, "learning_rate": 1.6454531249894975e-05, "loss": 0.8613, "step": 2141 }, { "epoch": 0.8910381405506591, "grad_norm": 1.6125950813293457, "learning_rate": 1.645110854271738e-05, "loss": 0.9592, "step": 2142 }, { "epoch": 0.8914541247432598, "grad_norm": 1.671553134918213, "learning_rate": 1.644768454059792e-05, "loss": 0.9658, "step": 2143 }, { "epoch": 0.8918701089358604, "grad_norm": 33.162559509277344, "learning_rate": 1.6444259244223897e-05, "loss": 0.8849, "step": 2144 }, { "epoch": 0.8922860931284611, "grad_norm": 1.5630083084106445, "learning_rate": 1.6440832654282876e-05, "loss": 0.8733, "step": 2145 }, { "epoch": 0.8927020773210618, "grad_norm": 1.5766807794570923, "learning_rate": 1.6437404771462688e-05, "loss": 0.8172, "step": 2146 }, { "epoch": 0.8931180615136625, "grad_norm": 1.5963863134384155, "learning_rate": 1.6433975596451413e-05, "loss": 0.7825, "step": 2147 }, { "epoch": 0.8935340457062632, "grad_norm": 1.5000897645950317, "learning_rate": 1.6430545129937393e-05, "loss": 0.8925, "step": 2148 }, { "epoch": 0.8939500298988639, "grad_norm": 1.5055453777313232, "learning_rate": 1.642711337260924e-05, "loss": 0.8616, "step": 2149 }, { "epoch": 0.8943660140914645, "grad_norm": 1.562251091003418, "learning_rate": 1.6423680325155808e-05, "loss": 1.0533, "step": 2150 }, { "epoch": 0.8947819982840652, "grad_norm": 1.708051323890686, "learning_rate": 1.642024598826622e-05, "loss": 0.8788, "step": 2151 }, { "epoch": 0.8951979824766659, "grad_norm": 1.523444652557373, "learning_rate": 1.6416810362629857e-05, "loss": 0.852, "step": 2152 }, { "epoch": 0.8956139666692665, "grad_norm": 1.5533174276351929, "learning_rate": 1.641337344893636e-05, "loss": 0.8516, "step": 2153 }, { "epoch": 0.8960299508618672, "grad_norm": 1.564210295677185, "learning_rate": 1.6409935247875623e-05, "loss": 0.9571, "step": 2154 }, { "epoch": 0.8964459350544679, "grad_norm": 1.5257575511932373, "learning_rate": 1.64064957601378e-05, "loss": 0.914, "step": 2155 }, { "epoch": 0.8968619192470686, "grad_norm": 1.6088191270828247, "learning_rate": 1.6403054986413316e-05, "loss": 0.8658, "step": 2156 }, { "epoch": 0.8972779034396693, "grad_norm": 5.461593151092529, "learning_rate": 1.639961292739283e-05, "loss": 0.8465, "step": 2157 }, { "epoch": 0.89769388763227, "grad_norm": 1.5528464317321777, "learning_rate": 1.639616958376728e-05, "loss": 0.8019, "step": 2158 }, { "epoch": 0.8981098718248707, "grad_norm": 1.4441869258880615, "learning_rate": 1.6392724956227856e-05, "loss": 0.8831, "step": 2159 }, { "epoch": 0.8985258560174714, "grad_norm": 1.5878814458847046, "learning_rate": 1.6389279045465993e-05, "loss": 0.8775, "step": 2160 }, { "epoch": 0.8989418402100721, "grad_norm": 1.5623505115509033, "learning_rate": 1.6385831852173407e-05, "loss": 0.867, "step": 2161 }, { "epoch": 0.8993578244026726, "grad_norm": 1.6057255268096924, "learning_rate": 1.6382383377042054e-05, "loss": 0.9195, "step": 2162 }, { "epoch": 0.8997738085952733, "grad_norm": 1.4842413663864136, "learning_rate": 1.637893362076415e-05, "loss": 0.9489, "step": 2163 }, { "epoch": 0.900189792787874, "grad_norm": 1.4700374603271484, "learning_rate": 1.637548258403217e-05, "loss": 0.9231, "step": 2164 }, { "epoch": 0.9006057769804747, "grad_norm": 1.6222095489501953, "learning_rate": 1.6372030267538853e-05, "loss": 0.9169, "step": 2165 }, { "epoch": 0.9010217611730754, "grad_norm": 1.572607159614563, "learning_rate": 1.6368576671977182e-05, "loss": 0.8318, "step": 2166 }, { "epoch": 0.9014377453656761, "grad_norm": 154.59788513183594, "learning_rate": 1.6365121798040407e-05, "loss": 0.8916, "step": 2167 }, { "epoch": 0.9018537295582768, "grad_norm": 1.5972256660461426, "learning_rate": 1.6361665646422027e-05, "loss": 0.941, "step": 2168 }, { "epoch": 0.9022697137508775, "grad_norm": 1.6189945936203003, "learning_rate": 1.63582082178158e-05, "loss": 0.9635, "step": 2169 }, { "epoch": 0.9026856979434782, "grad_norm": 1.4191770553588867, "learning_rate": 1.6354749512915748e-05, "loss": 0.8515, "step": 2170 }, { "epoch": 0.9031016821360788, "grad_norm": 1.6670681238174438, "learning_rate": 1.6351289532416136e-05, "loss": 0.89, "step": 2171 }, { "epoch": 0.9035176663286795, "grad_norm": 1.5708303451538086, "learning_rate": 1.6347828277011496e-05, "loss": 0.8898, "step": 2172 }, { "epoch": 0.9039336505212802, "grad_norm": 1.6540530920028687, "learning_rate": 1.6344365747396614e-05, "loss": 0.8316, "step": 2173 }, { "epoch": 0.9043496347138809, "grad_norm": 1.57146418094635, "learning_rate": 1.634090194426652e-05, "loss": 0.9927, "step": 2174 }, { "epoch": 0.9047656189064816, "grad_norm": 1.5645211935043335, "learning_rate": 1.6337436868316516e-05, "loss": 0.9278, "step": 2175 }, { "epoch": 0.9051816030990822, "grad_norm": 1.3176783323287964, "learning_rate": 1.6333970520242152e-05, "loss": 0.7232, "step": 2176 }, { "epoch": 0.9055975872916829, "grad_norm": 1.4123308658599854, "learning_rate": 1.633050290073923e-05, "loss": 0.8059, "step": 2177 }, { "epoch": 0.9060135714842836, "grad_norm": 1.4911174774169922, "learning_rate": 1.632703401050382e-05, "loss": 0.8799, "step": 2178 }, { "epoch": 0.9064295556768843, "grad_norm": 1.4548330307006836, "learning_rate": 1.6323563850232227e-05, "loss": 0.8089, "step": 2179 }, { "epoch": 0.9068455398694849, "grad_norm": 1.6467912197113037, "learning_rate": 1.632009242062103e-05, "loss": 0.9703, "step": 2180 }, { "epoch": 0.9072615240620856, "grad_norm": 427.07635498046875, "learning_rate": 1.6316619722367052e-05, "loss": 0.864, "step": 2181 }, { "epoch": 0.9076775082546863, "grad_norm": 1.57991361618042, "learning_rate": 1.6313145756167374e-05, "loss": 0.8734, "step": 2182 }, { "epoch": 0.908093492447287, "grad_norm": 1.5922528505325317, "learning_rate": 1.630967052271933e-05, "loss": 0.7818, "step": 2183 }, { "epoch": 0.9085094766398877, "grad_norm": 1.650792121887207, "learning_rate": 1.630619402272051e-05, "loss": 0.9051, "step": 2184 }, { "epoch": 0.9089254608324884, "grad_norm": 1.6233364343643188, "learning_rate": 1.630271625686876e-05, "loss": 0.8345, "step": 2185 }, { "epoch": 0.9093414450250891, "grad_norm": 1.5893033742904663, "learning_rate": 1.6299237225862178e-05, "loss": 0.9075, "step": 2186 }, { "epoch": 0.9097574292176898, "grad_norm": 1.5121182203292847, "learning_rate": 1.629575693039911e-05, "loss": 0.8755, "step": 2187 }, { "epoch": 0.9101734134102905, "grad_norm": 1.4394782781600952, "learning_rate": 1.6292275371178164e-05, "loss": 0.8738, "step": 2188 }, { "epoch": 0.910589397602891, "grad_norm": 1.593707799911499, "learning_rate": 1.62887925488982e-05, "loss": 0.8404, "step": 2189 }, { "epoch": 0.9110053817954917, "grad_norm": 1.5706150531768799, "learning_rate": 1.6285308464258335e-05, "loss": 0.8753, "step": 2190 }, { "epoch": 0.9114213659880924, "grad_norm": 6.490848064422607, "learning_rate": 1.628182311795793e-05, "loss": 0.9046, "step": 2191 }, { "epoch": 0.9118373501806931, "grad_norm": 1.4167453050613403, "learning_rate": 1.6278336510696603e-05, "loss": 0.793, "step": 2192 }, { "epoch": 0.9122533343732938, "grad_norm": 360.8721923828125, "learning_rate": 1.6274848643174235e-05, "loss": 0.8592, "step": 2193 }, { "epoch": 0.9126693185658945, "grad_norm": 1.5214552879333496, "learning_rate": 1.6271359516090943e-05, "loss": 0.8472, "step": 2194 }, { "epoch": 0.9130853027584952, "grad_norm": 1.5986747741699219, "learning_rate": 1.6267869130147106e-05, "loss": 0.8431, "step": 2195 }, { "epoch": 0.9135012869510959, "grad_norm": 1.5306599140167236, "learning_rate": 1.6264377486043358e-05, "loss": 0.9249, "step": 2196 }, { "epoch": 0.9139172711436966, "grad_norm": 1.542023777961731, "learning_rate": 1.6260884584480586e-05, "loss": 1.024, "step": 2197 }, { "epoch": 0.9143332553362972, "grad_norm": 1.5669190883636475, "learning_rate": 1.6257390426159915e-05, "loss": 0.8429, "step": 2198 }, { "epoch": 0.9147492395288979, "grad_norm": 14.942290306091309, "learning_rate": 1.6253895011782747e-05, "loss": 0.8864, "step": 2199 }, { "epoch": 0.9151652237214986, "grad_norm": 1.5152599811553955, "learning_rate": 1.625039834205071e-05, "loss": 0.7795, "step": 2200 }, { "epoch": 0.9155812079140992, "grad_norm": 1.5161445140838623, "learning_rate": 1.6246900417665704e-05, "loss": 0.8158, "step": 2201 }, { "epoch": 0.9159971921066999, "grad_norm": 1.659722089767456, "learning_rate": 1.624340123932987e-05, "loss": 0.8458, "step": 2202 }, { "epoch": 0.9164131762993006, "grad_norm": 1.536039113998413, "learning_rate": 1.623990080774561e-05, "loss": 0.9145, "step": 2203 }, { "epoch": 0.9168291604919013, "grad_norm": 1.5883564949035645, "learning_rate": 1.623639912361556e-05, "loss": 0.863, "step": 2204 }, { "epoch": 0.917245144684502, "grad_norm": 3.2609903812408447, "learning_rate": 1.623289618764263e-05, "loss": 0.945, "step": 2205 }, { "epoch": 0.9176611288771027, "grad_norm": 1.7088640928268433, "learning_rate": 1.6229392000529964e-05, "loss": 0.8521, "step": 2206 }, { "epoch": 0.9180771130697033, "grad_norm": 46.03165817260742, "learning_rate": 1.6225886562980965e-05, "loss": 0.9441, "step": 2207 }, { "epoch": 0.918493097262304, "grad_norm": 1.4754921197891235, "learning_rate": 1.6222379875699285e-05, "loss": 0.8206, "step": 2208 }, { "epoch": 0.9189090814549047, "grad_norm": 2.1490285396575928, "learning_rate": 1.6218871939388828e-05, "loss": 0.9697, "step": 2209 }, { "epoch": 0.9193250656475054, "grad_norm": 1.4964264631271362, "learning_rate": 1.621536275475375e-05, "loss": 0.9277, "step": 2210 }, { "epoch": 0.9197410498401061, "grad_norm": 1.5712428092956543, "learning_rate": 1.6211852322498452e-05, "loss": 0.8397, "step": 2211 }, { "epoch": 0.9201570340327068, "grad_norm": 1.5563093423843384, "learning_rate": 1.620834064332759e-05, "loss": 0.7998, "step": 2212 }, { "epoch": 0.9205730182253075, "grad_norm": 1.4136992692947388, "learning_rate": 1.620482771794607e-05, "loss": 0.846, "step": 2213 }, { "epoch": 0.9209890024179082, "grad_norm": 1.5119798183441162, "learning_rate": 1.6201313547059047e-05, "loss": 0.8343, "step": 2214 }, { "epoch": 0.9214049866105088, "grad_norm": 1.4453928470611572, "learning_rate": 1.6197798131371924e-05, "loss": 0.7596, "step": 2215 }, { "epoch": 0.9218209708031094, "grad_norm": 1.5641636848449707, "learning_rate": 1.6194281471590357e-05, "loss": 0.9157, "step": 2216 }, { "epoch": 0.9222369549957101, "grad_norm": 1.4371355772018433, "learning_rate": 1.619076356842026e-05, "loss": 0.7425, "step": 2217 }, { "epoch": 0.9226529391883108, "grad_norm": 1.6394662857055664, "learning_rate": 1.6187244422567777e-05, "loss": 0.8332, "step": 2218 }, { "epoch": 0.9230689233809115, "grad_norm": 1.5383915901184082, "learning_rate": 1.6183724034739313e-05, "loss": 0.836, "step": 2219 }, { "epoch": 0.9234849075735122, "grad_norm": 1.4674988985061646, "learning_rate": 1.6180202405641527e-05, "loss": 0.8764, "step": 2220 }, { "epoch": 0.9239008917661129, "grad_norm": 1.5196350812911987, "learning_rate": 1.6176679535981314e-05, "loss": 0.7872, "step": 2221 }, { "epoch": 0.9243168759587136, "grad_norm": 1.746071219444275, "learning_rate": 1.6173155426465835e-05, "loss": 0.9105, "step": 2222 }, { "epoch": 0.9247328601513143, "grad_norm": 158.7972869873047, "learning_rate": 1.6169630077802477e-05, "loss": 0.9156, "step": 2223 }, { "epoch": 0.925148844343915, "grad_norm": 1.5687627792358398, "learning_rate": 1.6166103490698905e-05, "loss": 0.807, "step": 2224 }, { "epoch": 0.9255648285365156, "grad_norm": 1.6306133270263672, "learning_rate": 1.6162575665863006e-05, "loss": 0.9387, "step": 2225 }, { "epoch": 0.9259808127291163, "grad_norm": 1.5787273645401, "learning_rate": 1.6159046604002933e-05, "loss": 0.8709, "step": 2226 }, { "epoch": 0.926396796921717, "grad_norm": 1.6618863344192505, "learning_rate": 1.6155516305827075e-05, "loss": 0.8278, "step": 2227 }, { "epoch": 0.9268127811143176, "grad_norm": 1.5232945680618286, "learning_rate": 1.6151984772044077e-05, "loss": 0.8075, "step": 2228 }, { "epoch": 0.9272287653069183, "grad_norm": 1.6379691362380981, "learning_rate": 1.6148452003362828e-05, "loss": 1.0192, "step": 2229 }, { "epoch": 0.927644749499519, "grad_norm": 1.6434721946716309, "learning_rate": 1.614491800049247e-05, "loss": 0.8831, "step": 2230 }, { "epoch": 0.9280607336921197, "grad_norm": 1.5200517177581787, "learning_rate": 1.6141382764142383e-05, "loss": 0.884, "step": 2231 }, { "epoch": 0.9284767178847204, "grad_norm": 1.4980311393737793, "learning_rate": 1.613784629502221e-05, "loss": 0.8218, "step": 2232 }, { "epoch": 0.9288927020773211, "grad_norm": 1.5544646978378296, "learning_rate": 1.6134308593841824e-05, "loss": 0.9477, "step": 2233 }, { "epoch": 0.9293086862699217, "grad_norm": 1.5186374187469482, "learning_rate": 1.613076966131136e-05, "loss": 0.8271, "step": 2234 }, { "epoch": 0.9297246704625224, "grad_norm": 1.6095411777496338, "learning_rate": 1.6127229498141187e-05, "loss": 0.8871, "step": 2235 }, { "epoch": 0.9301406546551231, "grad_norm": 1.7046723365783691, "learning_rate": 1.6123688105041938e-05, "loss": 0.9254, "step": 2236 }, { "epoch": 0.9305566388477238, "grad_norm": 1.5088894367218018, "learning_rate": 1.612014548272447e-05, "loss": 0.9228, "step": 2237 }, { "epoch": 0.9309726230403245, "grad_norm": 1.4066978693008423, "learning_rate": 1.6116601631899915e-05, "loss": 0.8411, "step": 2238 }, { "epoch": 0.9313886072329252, "grad_norm": 1.5723040103912354, "learning_rate": 1.611305655327962e-05, "loss": 0.9232, "step": 2239 }, { "epoch": 0.9318045914255259, "grad_norm": 1.4966645240783691, "learning_rate": 1.6109510247575207e-05, "loss": 0.927, "step": 2240 }, { "epoch": 0.9322205756181265, "grad_norm": 1.6070870161056519, "learning_rate": 1.6105962715498523e-05, "loss": 0.9861, "step": 2241 }, { "epoch": 0.9326365598107272, "grad_norm": 97.22660827636719, "learning_rate": 1.6102413957761677e-05, "loss": 0.8356, "step": 2242 }, { "epoch": 0.9330525440033278, "grad_norm": 1.4950343370437622, "learning_rate": 1.6098863975077012e-05, "loss": 0.8871, "step": 2243 }, { "epoch": 0.9334685281959285, "grad_norm": 5.062091827392578, "learning_rate": 1.609531276815713e-05, "loss": 0.7843, "step": 2244 }, { "epoch": 0.9338845123885292, "grad_norm": 1.6245200634002686, "learning_rate": 1.6091760337714856e-05, "loss": 0.8701, "step": 2245 }, { "epoch": 0.9343004965811299, "grad_norm": 1.6561200618743896, "learning_rate": 1.608820668446329e-05, "loss": 0.8752, "step": 2246 }, { "epoch": 0.9347164807737306, "grad_norm": 1.4656604528427124, "learning_rate": 1.6084651809115756e-05, "loss": 0.872, "step": 2247 }, { "epoch": 0.9351324649663313, "grad_norm": 1.6722534894943237, "learning_rate": 1.608109571238583e-05, "loss": 0.9366, "step": 2248 }, { "epoch": 0.935548449158932, "grad_norm": 1.514764428138733, "learning_rate": 1.6077538394987334e-05, "loss": 0.8275, "step": 2249 }, { "epoch": 0.9359644333515327, "grad_norm": 1.3882094621658325, "learning_rate": 1.607397985763434e-05, "loss": 0.7651, "step": 2250 }, { "epoch": 0.9363804175441334, "grad_norm": 1.7279568910598755, "learning_rate": 1.607042010104115e-05, "loss": 0.9944, "step": 2251 }, { "epoch": 0.936796401736734, "grad_norm": 1.581791639328003, "learning_rate": 1.606685912592232e-05, "loss": 0.8469, "step": 2252 }, { "epoch": 0.9372123859293346, "grad_norm": 1.396004319190979, "learning_rate": 1.606329693299266e-05, "loss": 0.8407, "step": 2253 }, { "epoch": 0.9376283701219353, "grad_norm": 2.6737940311431885, "learning_rate": 1.605973352296721e-05, "loss": 0.844, "step": 2254 }, { "epoch": 0.938044354314536, "grad_norm": 1.6651716232299805, "learning_rate": 1.6056168896561257e-05, "loss": 0.8284, "step": 2255 }, { "epoch": 0.9384603385071367, "grad_norm": 1.5509660243988037, "learning_rate": 1.6052603054490335e-05, "loss": 0.8009, "step": 2256 }, { "epoch": 0.9388763226997374, "grad_norm": 1.5986061096191406, "learning_rate": 1.6049035997470224e-05, "loss": 0.8262, "step": 2257 }, { "epoch": 0.9392923068923381, "grad_norm": 61.29841995239258, "learning_rate": 1.6045467726216942e-05, "loss": 0.9153, "step": 2258 }, { "epoch": 0.9397082910849388, "grad_norm": 11.032609939575195, "learning_rate": 1.604189824144676e-05, "loss": 0.9139, "step": 2259 }, { "epoch": 0.9401242752775395, "grad_norm": 2.121447801589966, "learning_rate": 1.603832754387618e-05, "loss": 0.8633, "step": 2260 }, { "epoch": 0.9405402594701401, "grad_norm": 1.6197835206985474, "learning_rate": 1.6034755634221958e-05, "loss": 0.9106, "step": 2261 }, { "epoch": 0.9409562436627408, "grad_norm": 1.635467290878296, "learning_rate": 1.6031182513201085e-05, "loss": 1.0358, "step": 2262 }, { "epoch": 0.9413722278553415, "grad_norm": 1.7045766115188599, "learning_rate": 1.6027608181530806e-05, "loss": 0.892, "step": 2263 }, { "epoch": 0.9417882120479422, "grad_norm": 1.6637533903121948, "learning_rate": 1.60240326399286e-05, "loss": 0.978, "step": 2264 }, { "epoch": 0.9422041962405429, "grad_norm": 1.783701777458191, "learning_rate": 1.602045588911219e-05, "loss": 0.7495, "step": 2265 }, { "epoch": 0.9426201804331436, "grad_norm": 1.4523789882659912, "learning_rate": 1.6016877929799545e-05, "loss": 0.8293, "step": 2266 }, { "epoch": 0.9430361646257442, "grad_norm": 1.5890156030654907, "learning_rate": 1.601329876270887e-05, "loss": 0.897, "step": 2267 }, { "epoch": 0.9434521488183449, "grad_norm": 1.4761089086532593, "learning_rate": 1.6009718388558627e-05, "loss": 0.9228, "step": 2268 }, { "epoch": 0.9438681330109456, "grad_norm": 1.6185630559921265, "learning_rate": 1.6006136808067498e-05, "loss": 0.929, "step": 2269 }, { "epoch": 0.9442841172035462, "grad_norm": 1.6369470357894897, "learning_rate": 1.600255402195443e-05, "loss": 0.8641, "step": 2270 }, { "epoch": 0.9447001013961469, "grad_norm": 1.5036851167678833, "learning_rate": 1.59989700309386e-05, "loss": 0.9082, "step": 2271 }, { "epoch": 0.9451160855887476, "grad_norm": 1.640054702758789, "learning_rate": 1.5995384835739425e-05, "loss": 0.821, "step": 2272 }, { "epoch": 0.9455320697813483, "grad_norm": 1.4861875772476196, "learning_rate": 1.5991798437076573e-05, "loss": 0.8752, "step": 2273 }, { "epoch": 0.945948053973949, "grad_norm": 12.165118217468262, "learning_rate": 1.5988210835669934e-05, "loss": 0.8903, "step": 2274 }, { "epoch": 0.9463640381665497, "grad_norm": 1.5279316902160645, "learning_rate": 1.5984622032239673e-05, "loss": 0.8422, "step": 2275 }, { "epoch": 0.9467800223591504, "grad_norm": 1.5260120630264282, "learning_rate": 1.598103202750616e-05, "loss": 0.951, "step": 2276 }, { "epoch": 0.9471960065517511, "grad_norm": 1.5835227966308594, "learning_rate": 1.5977440822190033e-05, "loss": 0.8282, "step": 2277 }, { "epoch": 0.9476119907443518, "grad_norm": 1.8242067098617554, "learning_rate": 1.5973848417012156e-05, "loss": 0.927, "step": 2278 }, { "epoch": 0.9480279749369523, "grad_norm": 1.523148775100708, "learning_rate": 1.5970254812693638e-05, "loss": 0.765, "step": 2279 }, { "epoch": 0.948443959129553, "grad_norm": 1.3649770021438599, "learning_rate": 1.5966660009955834e-05, "loss": 0.8658, "step": 2280 }, { "epoch": 0.9488599433221537, "grad_norm": 1.5118075609207153, "learning_rate": 1.5963064009520326e-05, "loss": 0.9523, "step": 2281 }, { "epoch": 0.9492759275147544, "grad_norm": 71.80716705322266, "learning_rate": 1.595946681210895e-05, "loss": 0.844, "step": 2282 }, { "epoch": 0.9496919117073551, "grad_norm": 1.4919332265853882, "learning_rate": 1.5955868418443786e-05, "loss": 0.9588, "step": 2283 }, { "epoch": 0.9501078958999558, "grad_norm": 1.4334995746612549, "learning_rate": 1.595226882924713e-05, "loss": 0.845, "step": 2284 }, { "epoch": 0.9505238800925565, "grad_norm": 1.5609192848205566, "learning_rate": 1.5948668045241545e-05, "loss": 0.8716, "step": 2285 }, { "epoch": 0.9509398642851572, "grad_norm": 1.50559663772583, "learning_rate": 1.5945066067149822e-05, "loss": 0.8605, "step": 2286 }, { "epoch": 0.9513558484777579, "grad_norm": 1.4591847658157349, "learning_rate": 1.5941462895694986e-05, "loss": 0.8753, "step": 2287 }, { "epoch": 0.9517718326703585, "grad_norm": 1.5334174633026123, "learning_rate": 1.593785853160031e-05, "loss": 0.8172, "step": 2288 }, { "epoch": 0.9521878168629592, "grad_norm": 1.5918457508087158, "learning_rate": 1.5934252975589302e-05, "loss": 0.911, "step": 2289 }, { "epoch": 0.9526038010555599, "grad_norm": 1.5847667455673218, "learning_rate": 1.593064622838572e-05, "loss": 0.94, "step": 2290 }, { "epoch": 0.9530197852481606, "grad_norm": 1.5918911695480347, "learning_rate": 1.5927038290713545e-05, "loss": 0.9453, "step": 2291 }, { "epoch": 0.9534357694407612, "grad_norm": 1.5100101232528687, "learning_rate": 1.5923429163297005e-05, "loss": 0.9371, "step": 2292 }, { "epoch": 0.9538517536333619, "grad_norm": 1.5111278295516968, "learning_rate": 1.5919818846860565e-05, "loss": 0.8808, "step": 2293 }, { "epoch": 0.9542677378259626, "grad_norm": 1.6915651559829712, "learning_rate": 1.5916207342128934e-05, "loss": 0.9331, "step": 2294 }, { "epoch": 0.9546837220185633, "grad_norm": 1.5263279676437378, "learning_rate": 1.5912594649827053e-05, "loss": 0.8551, "step": 2295 }, { "epoch": 0.955099706211164, "grad_norm": 1.687901258468628, "learning_rate": 1.5908980770680105e-05, "loss": 0.8323, "step": 2296 }, { "epoch": 0.9555156904037646, "grad_norm": 1.523196816444397, "learning_rate": 1.5905365705413503e-05, "loss": 0.7263, "step": 2297 }, { "epoch": 0.9559316745963653, "grad_norm": 1.5519044399261475, "learning_rate": 1.590174945475292e-05, "loss": 0.9172, "step": 2298 }, { "epoch": 0.956347658788966, "grad_norm": 1.7933214902877808, "learning_rate": 1.5898132019424238e-05, "loss": 0.9465, "step": 2299 }, { "epoch": 0.9567636429815667, "grad_norm": 1.6573889255523682, "learning_rate": 1.5894513400153598e-05, "loss": 0.9304, "step": 2300 }, { "epoch": 0.9571796271741674, "grad_norm": 1.5709803104400635, "learning_rate": 1.589089359766737e-05, "loss": 0.757, "step": 2301 }, { "epoch": 0.9575956113667681, "grad_norm": 1.5966155529022217, "learning_rate": 1.588727261269216e-05, "loss": 0.8386, "step": 2302 }, { "epoch": 0.9580115955593688, "grad_norm": 1.5325875282287598, "learning_rate": 1.5883650445954822e-05, "loss": 0.9326, "step": 2303 }, { "epoch": 0.9584275797519695, "grad_norm": 1.627652883529663, "learning_rate": 1.5880027098182433e-05, "loss": 0.8706, "step": 2304 }, { "epoch": 0.9588435639445702, "grad_norm": 1.4552260637283325, "learning_rate": 1.587640257010232e-05, "loss": 0.9098, "step": 2305 }, { "epoch": 0.9592595481371707, "grad_norm": 1.6902717351913452, "learning_rate": 1.5872776862442028e-05, "loss": 0.9325, "step": 2306 }, { "epoch": 0.9596755323297714, "grad_norm": 1.6865758895874023, "learning_rate": 1.5869149975929364e-05, "loss": 0.8526, "step": 2307 }, { "epoch": 0.9600915165223721, "grad_norm": 1.490578293800354, "learning_rate": 1.5865521911292353e-05, "loss": 0.9314, "step": 2308 }, { "epoch": 0.9605075007149728, "grad_norm": 1.5770468711853027, "learning_rate": 1.5861892669259264e-05, "loss": 0.9129, "step": 2309 }, { "epoch": 0.9609234849075735, "grad_norm": 7.601449966430664, "learning_rate": 1.5858262250558603e-05, "loss": 0.8905, "step": 2310 }, { "epoch": 0.9613394691001742, "grad_norm": 1.589726448059082, "learning_rate": 1.5854630655919107e-05, "loss": 0.8496, "step": 2311 }, { "epoch": 0.9617554532927749, "grad_norm": 1.7273651361465454, "learning_rate": 1.5850997886069755e-05, "loss": 0.8374, "step": 2312 }, { "epoch": 0.9621714374853756, "grad_norm": 1.6315186023712158, "learning_rate": 1.5847363941739754e-05, "loss": 0.8062, "step": 2313 }, { "epoch": 0.9625874216779763, "grad_norm": 1.556193232536316, "learning_rate": 1.5843728823658553e-05, "loss": 0.9433, "step": 2314 }, { "epoch": 0.9630034058705769, "grad_norm": 1.6628758907318115, "learning_rate": 1.584009253255584e-05, "loss": 0.8701, "step": 2315 }, { "epoch": 0.9634193900631776, "grad_norm": 1.5644513368606567, "learning_rate": 1.5836455069161535e-05, "loss": 0.9191, "step": 2316 }, { "epoch": 0.9638353742557783, "grad_norm": 1.4896897077560425, "learning_rate": 1.5832816434205784e-05, "loss": 0.8517, "step": 2317 }, { "epoch": 0.964251358448379, "grad_norm": 1.7350571155548096, "learning_rate": 1.582917662841898e-05, "loss": 0.9303, "step": 2318 }, { "epoch": 0.9646673426409796, "grad_norm": 1.5186469554901123, "learning_rate": 1.582553565253175e-05, "loss": 0.8342, "step": 2319 }, { "epoch": 0.9650833268335803, "grad_norm": 1.5504176616668701, "learning_rate": 1.582189350727495e-05, "loss": 0.9338, "step": 2320 }, { "epoch": 0.965499311026181, "grad_norm": 1.5969303846359253, "learning_rate": 1.5818250193379676e-05, "loss": 0.941, "step": 2321 }, { "epoch": 0.9659152952187817, "grad_norm": 1.6575871706008911, "learning_rate": 1.5814605711577254e-05, "loss": 0.9331, "step": 2322 }, { "epoch": 0.9663312794113824, "grad_norm": 1.5539567470550537, "learning_rate": 1.581096006259925e-05, "loss": 0.7871, "step": 2323 }, { "epoch": 0.966747263603983, "grad_norm": 32.0183219909668, "learning_rate": 1.5807313247177457e-05, "loss": 0.8389, "step": 2324 }, { "epoch": 0.9671632477965837, "grad_norm": 1.5387465953826904, "learning_rate": 1.5803665266043914e-05, "loss": 0.9129, "step": 2325 }, { "epoch": 0.9675792319891844, "grad_norm": 1.6151412725448608, "learning_rate": 1.5800016119930878e-05, "loss": 0.8505, "step": 2326 }, { "epoch": 0.9679952161817851, "grad_norm": 1.5879162549972534, "learning_rate": 1.5796365809570853e-05, "loss": 0.9135, "step": 2327 }, { "epoch": 0.9684112003743858, "grad_norm": 1.50468909740448, "learning_rate": 1.579271433569657e-05, "loss": 0.9091, "step": 2328 }, { "epoch": 0.9688271845669865, "grad_norm": 1.6881616115570068, "learning_rate": 1.5789061699040995e-05, "loss": 0.9407, "step": 2329 }, { "epoch": 0.9692431687595872, "grad_norm": 1.4933116436004639, "learning_rate": 1.578540790033733e-05, "loss": 0.8851, "step": 2330 }, { "epoch": 0.9696591529521879, "grad_norm": 1.4813333749771118, "learning_rate": 1.5781752940319007e-05, "loss": 0.8867, "step": 2331 }, { "epoch": 0.9700751371447885, "grad_norm": 1.5165036916732788, "learning_rate": 1.5778096819719695e-05, "loss": 0.8134, "step": 2332 }, { "epoch": 0.9704911213373891, "grad_norm": 1.6927285194396973, "learning_rate": 1.577443953927329e-05, "loss": 0.9153, "step": 2333 }, { "epoch": 0.9709071055299898, "grad_norm": 1.5255653858184814, "learning_rate": 1.5770781099713924e-05, "loss": 0.8027, "step": 2334 }, { "epoch": 0.9713230897225905, "grad_norm": 229.27403259277344, "learning_rate": 1.5767121501775963e-05, "loss": 0.817, "step": 2335 }, { "epoch": 0.9717390739151912, "grad_norm": 1.5209635496139526, "learning_rate": 1.5763460746194003e-05, "loss": 0.805, "step": 2336 }, { "epoch": 0.9721550581077919, "grad_norm": 1.5404969453811646, "learning_rate": 1.5759798833702874e-05, "loss": 0.8062, "step": 2337 }, { "epoch": 0.9725710423003926, "grad_norm": 60.92780303955078, "learning_rate": 1.5756135765037643e-05, "loss": 0.7722, "step": 2338 }, { "epoch": 0.9729870264929933, "grad_norm": 1.5827875137329102, "learning_rate": 1.5752471540933593e-05, "loss": 0.8918, "step": 2339 }, { "epoch": 0.973403010685594, "grad_norm": 1.6905571222305298, "learning_rate": 1.574880616212626e-05, "loss": 0.8857, "step": 2340 }, { "epoch": 0.9738189948781947, "grad_norm": 1.6136163473129272, "learning_rate": 1.5745139629351395e-05, "loss": 0.9016, "step": 2341 }, { "epoch": 0.9742349790707953, "grad_norm": 1.6698077917099, "learning_rate": 1.5741471943344994e-05, "loss": 0.8191, "step": 2342 }, { "epoch": 0.974650963263396, "grad_norm": 1.49519944190979, "learning_rate": 1.5737803104843267e-05, "loss": 0.9401, "step": 2343 }, { "epoch": 0.9750669474559966, "grad_norm": 1.6062859296798706, "learning_rate": 1.5734133114582676e-05, "loss": 0.8856, "step": 2344 }, { "epoch": 0.9754829316485973, "grad_norm": 1.6245100498199463, "learning_rate": 1.5730461973299903e-05, "loss": 0.9196, "step": 2345 }, { "epoch": 0.975898915841198, "grad_norm": 1.6312309503555298, "learning_rate": 1.572678968173186e-05, "loss": 0.7166, "step": 2346 }, { "epoch": 0.9763149000337987, "grad_norm": 14.641081809997559, "learning_rate": 1.5723116240615693e-05, "loss": 0.8568, "step": 2347 }, { "epoch": 0.9767308842263994, "grad_norm": 2.672646999359131, "learning_rate": 1.571944165068878e-05, "loss": 0.8781, "step": 2348 }, { "epoch": 0.9771468684190001, "grad_norm": 1.558361291885376, "learning_rate": 1.571576591268872e-05, "loss": 0.8131, "step": 2349 }, { "epoch": 0.9775628526116008, "grad_norm": 25.29528045654297, "learning_rate": 1.5712089027353362e-05, "loss": 0.9582, "step": 2350 }, { "epoch": 0.9779788368042014, "grad_norm": 1.5128881931304932, "learning_rate": 1.570841099542076e-05, "loss": 0.8014, "step": 2351 }, { "epoch": 0.9783948209968021, "grad_norm": 1.58877694606781, "learning_rate": 1.5704731817629225e-05, "loss": 0.8427, "step": 2352 }, { "epoch": 0.9788108051894028, "grad_norm": 1.649539828300476, "learning_rate": 1.5701051494717278e-05, "loss": 0.8887, "step": 2353 }, { "epoch": 0.9792267893820035, "grad_norm": 1.6796523332595825, "learning_rate": 1.5697370027423672e-05, "loss": 0.8584, "step": 2354 }, { "epoch": 0.9796427735746042, "grad_norm": 1.6113214492797852, "learning_rate": 1.56936874164874e-05, "loss": 0.8925, "step": 2355 }, { "epoch": 0.9800587577672049, "grad_norm": 1.6490601301193237, "learning_rate": 1.569000366264768e-05, "loss": 0.8573, "step": 2356 }, { "epoch": 0.9804747419598056, "grad_norm": 1.7214421033859253, "learning_rate": 1.5686318766643952e-05, "loss": 0.8473, "step": 2357 }, { "epoch": 0.9808907261524062, "grad_norm": 1.6338930130004883, "learning_rate": 1.5682632729215894e-05, "loss": 0.9191, "step": 2358 }, { "epoch": 0.9813067103450069, "grad_norm": 1.406484603881836, "learning_rate": 1.5678945551103413e-05, "loss": 0.7507, "step": 2359 }, { "epoch": 0.9817226945376075, "grad_norm": 1.6109747886657715, "learning_rate": 1.5675257233046643e-05, "loss": 0.9015, "step": 2360 }, { "epoch": 0.9821386787302082, "grad_norm": 1.4685695171356201, "learning_rate": 1.5671567775785937e-05, "loss": 0.8675, "step": 2361 }, { "epoch": 0.9825546629228089, "grad_norm": 1.5080066919326782, "learning_rate": 1.5667877180061898e-05, "loss": 0.9166, "step": 2362 }, { "epoch": 0.9829706471154096, "grad_norm": 1.4343678951263428, "learning_rate": 1.5664185446615344e-05, "loss": 0.8389, "step": 2363 }, { "epoch": 0.9833866313080103, "grad_norm": 1.7572903633117676, "learning_rate": 1.5660492576187313e-05, "loss": 0.8872, "step": 2364 }, { "epoch": 0.983802615500611, "grad_norm": 1.5281026363372803, "learning_rate": 1.565679856951909e-05, "loss": 0.8134, "step": 2365 }, { "epoch": 0.9842185996932117, "grad_norm": 1.653788447380066, "learning_rate": 1.5653103427352177e-05, "loss": 0.8579, "step": 2366 }, { "epoch": 0.9846345838858124, "grad_norm": 2.792950391769409, "learning_rate": 1.5649407150428303e-05, "loss": 0.9594, "step": 2367 }, { "epoch": 0.9850505680784131, "grad_norm": 182.95982360839844, "learning_rate": 1.5645709739489436e-05, "loss": 0.8283, "step": 2368 }, { "epoch": 0.9854665522710137, "grad_norm": 1.5512362718582153, "learning_rate": 1.564201119527775e-05, "loss": 0.9096, "step": 2369 }, { "epoch": 0.9858825364636143, "grad_norm": 467.4891052246094, "learning_rate": 1.5638311518535677e-05, "loss": 0.8787, "step": 2370 }, { "epoch": 0.986298520656215, "grad_norm": 1.6795307397842407, "learning_rate": 1.5634610710005844e-05, "loss": 0.9117, "step": 2371 }, { "epoch": 0.9867145048488157, "grad_norm": 1.6208604574203491, "learning_rate": 1.5630908770431128e-05, "loss": 0.8724, "step": 2372 }, { "epoch": 0.9871304890414164, "grad_norm": 329.30694580078125, "learning_rate": 1.5627205700554627e-05, "loss": 0.9394, "step": 2373 }, { "epoch": 0.9875464732340171, "grad_norm": 1.6881331205368042, "learning_rate": 1.5623501501119657e-05, "loss": 0.8487, "step": 2374 }, { "epoch": 0.9879624574266178, "grad_norm": 6.918861389160156, "learning_rate": 1.5619796172869777e-05, "loss": 0.9987, "step": 2375 }, { "epoch": 0.9883784416192185, "grad_norm": 1.6174031496047974, "learning_rate": 1.561608971654876e-05, "loss": 0.8625, "step": 2376 }, { "epoch": 0.9887944258118192, "grad_norm": 7.411485195159912, "learning_rate": 1.561238213290061e-05, "loss": 0.9447, "step": 2377 }, { "epoch": 0.9892104100044198, "grad_norm": 1.522835612297058, "learning_rate": 1.560867342266955e-05, "loss": 0.9073, "step": 2378 }, { "epoch": 0.9896263941970205, "grad_norm": 1.4016748666763306, "learning_rate": 1.5604963586600046e-05, "loss": 0.7065, "step": 2379 }, { "epoch": 0.9900423783896212, "grad_norm": 1.71564781665802, "learning_rate": 1.5601252625436773e-05, "loss": 0.9828, "step": 2380 }, { "epoch": 0.9904583625822219, "grad_norm": 7.537070274353027, "learning_rate": 1.5597540539924642e-05, "loss": 0.8509, "step": 2381 }, { "epoch": 0.9908743467748226, "grad_norm": 1.4433631896972656, "learning_rate": 1.5593827330808786e-05, "loss": 0.8082, "step": 2382 }, { "epoch": 0.9912903309674232, "grad_norm": 1.639976143836975, "learning_rate": 1.5590112998834558e-05, "loss": 0.8075, "step": 2383 }, { "epoch": 0.9917063151600239, "grad_norm": 1.5137635469436646, "learning_rate": 1.5586397544747548e-05, "loss": 0.7602, "step": 2384 }, { "epoch": 0.9921222993526246, "grad_norm": 628.844970703125, "learning_rate": 1.5582680969293565e-05, "loss": 0.8197, "step": 2385 }, { "epoch": 0.9925382835452253, "grad_norm": 1.5508710145950317, "learning_rate": 1.557896327321864e-05, "loss": 0.8195, "step": 2386 }, { "epoch": 0.9929542677378259, "grad_norm": 1.610579252243042, "learning_rate": 1.5575244457269036e-05, "loss": 0.9853, "step": 2387 }, { "epoch": 0.9933702519304266, "grad_norm": 1.488541841506958, "learning_rate": 1.5571524522191235e-05, "loss": 0.8521, "step": 2388 }, { "epoch": 0.9937862361230273, "grad_norm": 1.62030827999115, "learning_rate": 1.5567803468731945e-05, "loss": 1.021, "step": 2389 }, { "epoch": 0.994202220315628, "grad_norm": 1.562685489654541, "learning_rate": 1.55640812976381e-05, "loss": 0.8964, "step": 2390 }, { "epoch": 0.9946182045082287, "grad_norm": 1.4914287328720093, "learning_rate": 1.5560358009656863e-05, "loss": 0.8056, "step": 2391 }, { "epoch": 0.9950341887008294, "grad_norm": 1.9172981977462769, "learning_rate": 1.555663360553561e-05, "loss": 0.9596, "step": 2392 }, { "epoch": 0.9954501728934301, "grad_norm": 1.3902288675308228, "learning_rate": 1.5552908086021945e-05, "loss": 0.9058, "step": 2393 }, { "epoch": 0.9958661570860308, "grad_norm": 1.561497688293457, "learning_rate": 1.5549181451863703e-05, "loss": 0.9192, "step": 2394 }, { "epoch": 0.9962821412786315, "grad_norm": 1.3737318515777588, "learning_rate": 1.5545453703808936e-05, "loss": 0.8555, "step": 2395 }, { "epoch": 0.996698125471232, "grad_norm": 1.5880844593048096, "learning_rate": 1.5541724842605922e-05, "loss": 0.8813, "step": 2396 }, { "epoch": 0.9971141096638327, "grad_norm": 1.573256492614746, "learning_rate": 1.553799486900316e-05, "loss": 0.9504, "step": 2397 }, { "epoch": 0.9975300938564334, "grad_norm": 37.1464729309082, "learning_rate": 1.553426378374937e-05, "loss": 0.8354, "step": 2398 }, { "epoch": 0.9979460780490341, "grad_norm": 1.5325430631637573, "learning_rate": 1.553053158759351e-05, "loss": 1.0011, "step": 2399 }, { "epoch": 0.9983620622416348, "grad_norm": 1.4689385890960693, "learning_rate": 1.5526798281284737e-05, "loss": 0.8765, "step": 2400 }, { "epoch": 0.9987780464342355, "grad_norm": 1.6709831953048706, "learning_rate": 1.552306386557245e-05, "loss": 0.9154, "step": 2401 }, { "epoch": 0.9991940306268362, "grad_norm": 1.6455554962158203, "learning_rate": 1.5519328341206273e-05, "loss": 0.9511, "step": 2402 }, { "epoch": 0.9996100148194369, "grad_norm": 1.5892279148101807, "learning_rate": 1.5515591708936026e-05, "loss": 0.8627, "step": 2403 }, { "epoch": 1.0004159841926006, "grad_norm": 2.222154140472412, "learning_rate": 1.5511853969511785e-05, "loss": 1.6863, "step": 2404 }, { "epoch": 1.0008319683852014, "grad_norm": 1.4897576570510864, "learning_rate": 1.550811512368382e-05, "loss": 0.7578, "step": 2405 }, { "epoch": 1.001247952577802, "grad_norm": 1.619345784187317, "learning_rate": 1.550437517220265e-05, "loss": 0.8617, "step": 2406 }, { "epoch": 1.0016639367704028, "grad_norm": 1.522023320198059, "learning_rate": 1.550063411581899e-05, "loss": 0.7227, "step": 2407 }, { "epoch": 1.0020799209630034, "grad_norm": 1.5441606044769287, "learning_rate": 1.5496891955283798e-05, "loss": 0.861, "step": 2408 }, { "epoch": 1.0024959051556042, "grad_norm": 1.4940234422683716, "learning_rate": 1.5493148691348238e-05, "loss": 0.9036, "step": 2409 }, { "epoch": 1.0029118893482047, "grad_norm": 1.6916579008102417, "learning_rate": 1.54894043247637e-05, "loss": 0.8268, "step": 2410 }, { "epoch": 1.0033278735408055, "grad_norm": 1.515088677406311, "learning_rate": 1.5485658856281796e-05, "loss": 0.8293, "step": 2411 }, { "epoch": 1.0037438577334061, "grad_norm": 1.5796736478805542, "learning_rate": 1.5481912286654372e-05, "loss": 0.8605, "step": 2412 }, { "epoch": 1.0041598419260067, "grad_norm": 1.6369599103927612, "learning_rate": 1.5478164616633472e-05, "loss": 0.8232, "step": 2413 }, { "epoch": 1.0045758261186075, "grad_norm": 1.5732718706130981, "learning_rate": 1.5474415846971375e-05, "loss": 0.8131, "step": 2414 }, { "epoch": 1.004991810311208, "grad_norm": 1.5621263980865479, "learning_rate": 1.547066597842058e-05, "loss": 0.7909, "step": 2415 }, { "epoch": 1.005407794503809, "grad_norm": 1.6391491889953613, "learning_rate": 1.54669150117338e-05, "loss": 0.7588, "step": 2416 }, { "epoch": 1.0058237786964095, "grad_norm": 1.4718389511108398, "learning_rate": 1.5463162947663976e-05, "loss": 0.716, "step": 2417 }, { "epoch": 1.0062397628890103, "grad_norm": 1.798094391822815, "learning_rate": 1.5459409786964265e-05, "loss": 0.8345, "step": 2418 }, { "epoch": 1.0066557470816109, "grad_norm": 1.557714819908142, "learning_rate": 1.5455655530388046e-05, "loss": 0.7438, "step": 2419 }, { "epoch": 1.0070717312742117, "grad_norm": 1.597467303276062, "learning_rate": 1.5451900178688915e-05, "loss": 0.9091, "step": 2420 }, { "epoch": 1.0074877154668123, "grad_norm": 2.0066771507263184, "learning_rate": 1.54481437326207e-05, "loss": 0.8856, "step": 2421 }, { "epoch": 1.0079036996594128, "grad_norm": 1.4188450574874878, "learning_rate": 1.5444386192937425e-05, "loss": 0.7032, "step": 2422 }, { "epoch": 1.0083196838520136, "grad_norm": 1.578132152557373, "learning_rate": 1.5440627560393355e-05, "loss": 0.8322, "step": 2423 }, { "epoch": 1.0087356680446142, "grad_norm": 1.6121646165847778, "learning_rate": 1.5436867835742965e-05, "loss": 0.7826, "step": 2424 }, { "epoch": 1.009151652237215, "grad_norm": 1.5340831279754639, "learning_rate": 1.543310701974095e-05, "loss": 0.8052, "step": 2425 }, { "epoch": 1.0095676364298156, "grad_norm": 1.5576053857803345, "learning_rate": 1.5429345113142224e-05, "loss": 0.8111, "step": 2426 }, { "epoch": 1.0099836206224164, "grad_norm": 4.35866641998291, "learning_rate": 1.5425582116701924e-05, "loss": 0.8245, "step": 2427 }, { "epoch": 1.010399604815017, "grad_norm": 1.5219391584396362, "learning_rate": 1.54218180311754e-05, "loss": 0.8868, "step": 2428 }, { "epoch": 1.0108155890076178, "grad_norm": 1.653082251548767, "learning_rate": 1.5418052857318224e-05, "loss": 0.8569, "step": 2429 }, { "epoch": 1.0112315732002184, "grad_norm": 1.6662026643753052, "learning_rate": 1.5414286595886185e-05, "loss": 0.8967, "step": 2430 }, { "epoch": 1.011647557392819, "grad_norm": 1.6310101747512817, "learning_rate": 1.5410519247635293e-05, "loss": 0.8298, "step": 2431 }, { "epoch": 1.0120635415854198, "grad_norm": 1.5650633573532104, "learning_rate": 1.5406750813321774e-05, "loss": 0.8228, "step": 2432 }, { "epoch": 1.0124795257780204, "grad_norm": 1.6161893606185913, "learning_rate": 1.540298129370207e-05, "loss": 0.9161, "step": 2433 }, { "epoch": 1.0128955099706212, "grad_norm": 1.5956910848617554, "learning_rate": 1.5399210689532846e-05, "loss": 0.8609, "step": 2434 }, { "epoch": 1.0133114941632217, "grad_norm": 1.4591336250305176, "learning_rate": 1.5395439001570977e-05, "loss": 0.7929, "step": 2435 }, { "epoch": 1.0137274783558226, "grad_norm": 1.6039633750915527, "learning_rate": 1.5391666230573567e-05, "loss": 0.9754, "step": 2436 }, { "epoch": 1.0141434625484231, "grad_norm": 1.5714176893234253, "learning_rate": 1.5387892377297925e-05, "loss": 0.8533, "step": 2437 }, { "epoch": 1.014559446741024, "grad_norm": 1.5687692165374756, "learning_rate": 1.538411744250158e-05, "loss": 0.8399, "step": 2438 }, { "epoch": 1.0149754309336245, "grad_norm": 1.5534931421279907, "learning_rate": 1.5380341426942293e-05, "loss": 0.8835, "step": 2439 }, { "epoch": 1.015391415126225, "grad_norm": 1.5390551090240479, "learning_rate": 1.5376564331378018e-05, "loss": 0.7677, "step": 2440 }, { "epoch": 1.015807399318826, "grad_norm": 1.619646668434143, "learning_rate": 1.5372786156566947e-05, "loss": 0.839, "step": 2441 }, { "epoch": 1.0162233835114265, "grad_norm": 1.6832555532455444, "learning_rate": 1.5369006903267473e-05, "loss": 0.8836, "step": 2442 }, { "epoch": 1.0166393677040273, "grad_norm": 1.6479558944702148, "learning_rate": 1.536522657223821e-05, "loss": 0.8678, "step": 2443 }, { "epoch": 1.0170553518966279, "grad_norm": 1.6129292249679565, "learning_rate": 1.5361445164237996e-05, "loss": 0.8419, "step": 2444 }, { "epoch": 1.0174713360892287, "grad_norm": 1.6803637742996216, "learning_rate": 1.535766268002588e-05, "loss": 0.9377, "step": 2445 }, { "epoch": 1.0178873202818293, "grad_norm": 233.0509490966797, "learning_rate": 1.535387912036112e-05, "loss": 0.8242, "step": 2446 }, { "epoch": 1.01830330447443, "grad_norm": 1.6957204341888428, "learning_rate": 1.5350094486003203e-05, "loss": 0.9312, "step": 2447 }, { "epoch": 1.0187192886670307, "grad_norm": 1.7500593662261963, "learning_rate": 1.5346308777711823e-05, "loss": 0.9271, "step": 2448 }, { "epoch": 1.0191352728596312, "grad_norm": 1.7331523895263672, "learning_rate": 1.534252199624689e-05, "loss": 0.9089, "step": 2449 }, { "epoch": 1.019551257052232, "grad_norm": 1.7087669372558594, "learning_rate": 1.5338734142368528e-05, "loss": 0.8894, "step": 2450 }, { "epoch": 1.0199672412448326, "grad_norm": 1.5375769138336182, "learning_rate": 1.5334945216837085e-05, "loss": 0.7987, "step": 2451 }, { "epoch": 1.0203832254374334, "grad_norm": 1.5540390014648438, "learning_rate": 1.5331155220413118e-05, "loss": 0.8735, "step": 2452 }, { "epoch": 1.020799209630034, "grad_norm": 1.5802761316299438, "learning_rate": 1.5327364153857392e-05, "loss": 0.8006, "step": 2453 }, { "epoch": 1.0212151938226348, "grad_norm": 1659.4075927734375, "learning_rate": 1.532357201793091e-05, "loss": 0.8562, "step": 2454 }, { "epoch": 1.0216311780152354, "grad_norm": 1.7363922595977783, "learning_rate": 1.5319778813394855e-05, "loss": 0.9268, "step": 2455 }, { "epoch": 1.0220471622078362, "grad_norm": 1.6159989833831787, "learning_rate": 1.5315984541010657e-05, "loss": 0.8891, "step": 2456 }, { "epoch": 1.0224631464004368, "grad_norm": 1.6484638452529907, "learning_rate": 1.5312189201539935e-05, "loss": 0.8928, "step": 2457 }, { "epoch": 1.0228791305930374, "grad_norm": 1.5309357643127441, "learning_rate": 1.5308392795744545e-05, "loss": 0.9244, "step": 2458 }, { "epoch": 1.0232951147856382, "grad_norm": 145.97215270996094, "learning_rate": 1.5304595324386543e-05, "loss": 0.7653, "step": 2459 }, { "epoch": 1.0237110989782388, "grad_norm": 1.6922674179077148, "learning_rate": 1.53007967882282e-05, "loss": 0.8184, "step": 2460 }, { "epoch": 1.0241270831708396, "grad_norm": 1.8256422281265259, "learning_rate": 1.5296997188032e-05, "loss": 0.8781, "step": 2461 }, { "epoch": 1.0245430673634401, "grad_norm": 1.6213434934616089, "learning_rate": 1.529319652456065e-05, "loss": 0.8211, "step": 2462 }, { "epoch": 1.024959051556041, "grad_norm": 1.5929030179977417, "learning_rate": 1.5289394798577055e-05, "loss": 0.8098, "step": 2463 }, { "epoch": 1.0253750357486415, "grad_norm": 1.8870551586151123, "learning_rate": 1.528559201084435e-05, "loss": 0.8775, "step": 2464 }, { "epoch": 1.0257910199412423, "grad_norm": 1.6870983839035034, "learning_rate": 1.5281788162125867e-05, "loss": 0.8121, "step": 2465 }, { "epoch": 1.026207004133843, "grad_norm": 1.539736270904541, "learning_rate": 1.5277983253185167e-05, "loss": 0.7947, "step": 2466 }, { "epoch": 1.0266229883264435, "grad_norm": 1.5680716037750244, "learning_rate": 1.527417728478601e-05, "loss": 0.7982, "step": 2467 }, { "epoch": 1.0270389725190443, "grad_norm": 11.262317657470703, "learning_rate": 1.527037025769238e-05, "loss": 0.9443, "step": 2468 }, { "epoch": 1.0274549567116449, "grad_norm": 3.212878704071045, "learning_rate": 1.5266562172668462e-05, "loss": 0.7676, "step": 2469 }, { "epoch": 1.0278709409042457, "grad_norm": 1.7448810338974, "learning_rate": 1.526275303047866e-05, "loss": 0.8737, "step": 2470 }, { "epoch": 1.0282869250968463, "grad_norm": 1.7036181688308716, "learning_rate": 1.5258942831887594e-05, "loss": 0.8657, "step": 2471 }, { "epoch": 1.028702909289447, "grad_norm": 1.75706946849823, "learning_rate": 1.5255131577660089e-05, "loss": 0.9089, "step": 2472 }, { "epoch": 1.0291188934820477, "grad_norm": 1.7899796962738037, "learning_rate": 1.5251319268561182e-05, "loss": 0.8798, "step": 2473 }, { "epoch": 1.0295348776746485, "grad_norm": 1.5200163125991821, "learning_rate": 1.524750590535613e-05, "loss": 0.6966, "step": 2474 }, { "epoch": 1.029950861867249, "grad_norm": 1.6935291290283203, "learning_rate": 1.5243691488810391e-05, "loss": 0.8151, "step": 2475 }, { "epoch": 1.0303668460598496, "grad_norm": 1.7324973344802856, "learning_rate": 1.5239876019689639e-05, "loss": 0.827, "step": 2476 }, { "epoch": 1.0307828302524504, "grad_norm": 1.6937559843063354, "learning_rate": 1.5236059498759761e-05, "loss": 0.9332, "step": 2477 }, { "epoch": 1.031198814445051, "grad_norm": 1.559435486793518, "learning_rate": 1.5232241926786854e-05, "loss": 0.9051, "step": 2478 }, { "epoch": 1.0316147986376518, "grad_norm": 1.5569963455200195, "learning_rate": 1.522842330453723e-05, "loss": 0.8403, "step": 2479 }, { "epoch": 1.0320307828302524, "grad_norm": 1.4945347309112549, "learning_rate": 1.5224603632777398e-05, "loss": 0.8273, "step": 2480 }, { "epoch": 1.0324467670228532, "grad_norm": 1.7272353172302246, "learning_rate": 1.5220782912274092e-05, "loss": 0.9249, "step": 2481 }, { "epoch": 1.0328627512154538, "grad_norm": 1.6846286058425903, "learning_rate": 1.5216961143794253e-05, "loss": 0.8605, "step": 2482 }, { "epoch": 1.0332787354080546, "grad_norm": 1.5559496879577637, "learning_rate": 1.521313832810503e-05, "loss": 0.7696, "step": 2483 }, { "epoch": 1.0336947196006552, "grad_norm": 1.6083331108093262, "learning_rate": 1.5209314465973783e-05, "loss": 0.8534, "step": 2484 }, { "epoch": 1.0341107037932558, "grad_norm": 1.7574533224105835, "learning_rate": 1.5205489558168083e-05, "loss": 0.9404, "step": 2485 }, { "epoch": 1.0345266879858566, "grad_norm": 1.5947961807250977, "learning_rate": 1.520166360545571e-05, "loss": 0.873, "step": 2486 }, { "epoch": 1.0349426721784571, "grad_norm": 1.5349831581115723, "learning_rate": 1.519783660860465e-05, "loss": 0.9112, "step": 2487 }, { "epoch": 1.035358656371058, "grad_norm": 22.05772590637207, "learning_rate": 1.5194008568383107e-05, "loss": 0.7073, "step": 2488 }, { "epoch": 1.0357746405636585, "grad_norm": 1.5980616807937622, "learning_rate": 1.5190179485559492e-05, "loss": 0.8669, "step": 2489 }, { "epoch": 1.0361906247562593, "grad_norm": 1.6354937553405762, "learning_rate": 1.5186349360902419e-05, "loss": 0.8638, "step": 2490 }, { "epoch": 1.03660660894886, "grad_norm": 1.6736466884613037, "learning_rate": 1.5182518195180713e-05, "loss": 0.8172, "step": 2491 }, { "epoch": 1.0370225931414607, "grad_norm": 1.5409250259399414, "learning_rate": 1.5178685989163415e-05, "loss": 0.9327, "step": 2492 }, { "epoch": 1.0374385773340613, "grad_norm": 1.5642772912979126, "learning_rate": 1.517485274361977e-05, "loss": 0.9019, "step": 2493 }, { "epoch": 1.037854561526662, "grad_norm": 1.6266908645629883, "learning_rate": 1.5171018459319232e-05, "loss": 0.9398, "step": 2494 }, { "epoch": 1.0382705457192627, "grad_norm": 1.5757523775100708, "learning_rate": 1.516718313703146e-05, "loss": 0.7908, "step": 2495 }, { "epoch": 1.0386865299118633, "grad_norm": 1.5984429121017456, "learning_rate": 1.5163346777526325e-05, "loss": 0.7934, "step": 2496 }, { "epoch": 1.039102514104464, "grad_norm": 1.6945266723632812, "learning_rate": 1.5159509381573908e-05, "loss": 0.9751, "step": 2497 }, { "epoch": 1.0395184982970647, "grad_norm": 67.91423797607422, "learning_rate": 1.51556709499445e-05, "loss": 0.8116, "step": 2498 }, { "epoch": 1.0399344824896655, "grad_norm": 1.7240127325057983, "learning_rate": 1.5151831483408587e-05, "loss": 0.8971, "step": 2499 }, { "epoch": 1.040350466682266, "grad_norm": 9.36874771118164, "learning_rate": 1.5147990982736879e-05, "loss": 0.7704, "step": 2500 }, { "epoch": 1.040350466682266, "eval_loss": 0.7974026203155518, "eval_runtime": 1809.4646, "eval_samples_per_second": 3.643, "eval_steps_per_second": 1.822, "step": 2500 }, { "epoch": 1.0407664508748669, "grad_norm": 1.603265404701233, "learning_rate": 1.5144149448700283e-05, "loss": 0.8102, "step": 2501 }, { "epoch": 1.0411824350674674, "grad_norm": 29.339744567871094, "learning_rate": 1.5140306882069916e-05, "loss": 0.874, "step": 2502 }, { "epoch": 1.041598419260068, "grad_norm": 1.5500779151916504, "learning_rate": 1.5136463283617102e-05, "loss": 0.8863, "step": 2503 }, { "epoch": 1.0420144034526688, "grad_norm": 1.7024166584014893, "learning_rate": 1.5132618654113378e-05, "loss": 0.8511, "step": 2504 }, { "epoch": 1.0424303876452694, "grad_norm": 1.6909143924713135, "learning_rate": 1.5128772994330476e-05, "loss": 0.8914, "step": 2505 }, { "epoch": 1.0428463718378702, "grad_norm": 1.8089430332183838, "learning_rate": 1.5124926305040347e-05, "loss": 0.894, "step": 2506 }, { "epoch": 1.0432623560304708, "grad_norm": 1.463383436203003, "learning_rate": 1.5121078587015142e-05, "loss": 0.8844, "step": 2507 }, { "epoch": 1.0436783402230716, "grad_norm": 1.609634518623352, "learning_rate": 1.5117229841027219e-05, "loss": 0.8457, "step": 2508 }, { "epoch": 1.0440943244156722, "grad_norm": 1.5498086214065552, "learning_rate": 1.5113380067849144e-05, "loss": 0.8322, "step": 2509 }, { "epoch": 1.044510308608273, "grad_norm": 446.761474609375, "learning_rate": 1.5109529268253687e-05, "loss": 0.7974, "step": 2510 }, { "epoch": 1.0449262928008736, "grad_norm": 1.642881989479065, "learning_rate": 1.5105677443013826e-05, "loss": 0.8598, "step": 2511 }, { "epoch": 1.0453422769934742, "grad_norm": 1.6743990182876587, "learning_rate": 1.5101824592902747e-05, "loss": 0.7783, "step": 2512 }, { "epoch": 1.045758261186075, "grad_norm": 1.7232762575149536, "learning_rate": 1.509797071869384e-05, "loss": 0.9033, "step": 2513 }, { "epoch": 1.0461742453786755, "grad_norm": 1.6504367589950562, "learning_rate": 1.5094115821160693e-05, "loss": 0.8453, "step": 2514 }, { "epoch": 1.0465902295712763, "grad_norm": 1.8740153312683105, "learning_rate": 1.5090259901077112e-05, "loss": 0.8187, "step": 2515 }, { "epoch": 1.047006213763877, "grad_norm": 1.823381781578064, "learning_rate": 1.5086402959217097e-05, "loss": 1.0213, "step": 2516 }, { "epoch": 1.0474221979564777, "grad_norm": 126.30074310302734, "learning_rate": 1.5082544996354865e-05, "loss": 0.8377, "step": 2517 }, { "epoch": 1.0478381821490783, "grad_norm": 1.6638832092285156, "learning_rate": 1.507868601326483e-05, "loss": 0.8832, "step": 2518 }, { "epoch": 1.0482541663416791, "grad_norm": 1.6215579509735107, "learning_rate": 1.5074826010721605e-05, "loss": 0.8038, "step": 2519 }, { "epoch": 1.0486701505342797, "grad_norm": 1.7244694232940674, "learning_rate": 1.5070964989500023e-05, "loss": 0.9297, "step": 2520 }, { "epoch": 1.0490861347268803, "grad_norm": 1.61472487449646, "learning_rate": 1.506710295037511e-05, "loss": 0.8531, "step": 2521 }, { "epoch": 1.049502118919481, "grad_norm": 2.818415880203247, "learning_rate": 1.5063239894122099e-05, "loss": 0.7808, "step": 2522 }, { "epoch": 1.0499181031120817, "grad_norm": 1.6993930339813232, "learning_rate": 1.5059375821516429e-05, "loss": 0.8682, "step": 2523 }, { "epoch": 1.0503340873046825, "grad_norm": 1.6114345788955688, "learning_rate": 1.5055510733333742e-05, "loss": 0.8407, "step": 2524 }, { "epoch": 1.050750071497283, "grad_norm": 2.9676692485809326, "learning_rate": 1.5051644630349883e-05, "loss": 0.7674, "step": 2525 }, { "epoch": 1.0511660556898839, "grad_norm": 43.39280319213867, "learning_rate": 1.50477775133409e-05, "loss": 0.7767, "step": 2526 }, { "epoch": 1.0515820398824844, "grad_norm": 3.8380472660064697, "learning_rate": 1.5043909383083049e-05, "loss": 0.8669, "step": 2527 }, { "epoch": 1.0519980240750852, "grad_norm": 2.6330947875976562, "learning_rate": 1.5040040240352786e-05, "loss": 0.9596, "step": 2528 }, { "epoch": 1.0524140082676858, "grad_norm": 1.7455490827560425, "learning_rate": 1.503617008592677e-05, "loss": 0.9302, "step": 2529 }, { "epoch": 1.0528299924602864, "grad_norm": 1.6909734010696411, "learning_rate": 1.5032298920581858e-05, "loss": 0.9703, "step": 2530 }, { "epoch": 1.0532459766528872, "grad_norm": 1.5849249362945557, "learning_rate": 1.5028426745095123e-05, "loss": 0.787, "step": 2531 }, { "epoch": 1.0536619608454878, "grad_norm": 618719.9375, "learning_rate": 1.5024553560243834e-05, "loss": 0.824, "step": 2532 }, { "epoch": 1.0540779450380886, "grad_norm": 1.5474075078964233, "learning_rate": 1.5020679366805455e-05, "loss": 0.8253, "step": 2533 }, { "epoch": 1.0544939292306892, "grad_norm": 1.6025755405426025, "learning_rate": 1.5016804165557664e-05, "loss": 0.8224, "step": 2534 }, { "epoch": 1.05490991342329, "grad_norm": 1.6936179399490356, "learning_rate": 1.5012927957278335e-05, "loss": 0.8136, "step": 2535 }, { "epoch": 1.0553258976158906, "grad_norm": 1.5298932790756226, "learning_rate": 1.5009050742745544e-05, "loss": 0.798, "step": 2536 }, { "epoch": 1.0557418818084914, "grad_norm": 1.697591781616211, "learning_rate": 1.5005172522737574e-05, "loss": 0.9444, "step": 2537 }, { "epoch": 1.056157866001092, "grad_norm": 1.5952829122543335, "learning_rate": 1.5001293298032909e-05, "loss": 0.8431, "step": 2538 }, { "epoch": 1.0565738501936925, "grad_norm": 1.8618687391281128, "learning_rate": 1.4997413069410227e-05, "loss": 0.9084, "step": 2539 }, { "epoch": 1.0569898343862933, "grad_norm": 1.649682879447937, "learning_rate": 1.4993531837648416e-05, "loss": 0.8759, "step": 2540 }, { "epoch": 1.057405818578894, "grad_norm": 1.5617784261703491, "learning_rate": 1.4989649603526559e-05, "loss": 0.7444, "step": 2541 }, { "epoch": 1.0578218027714947, "grad_norm": 1.7252062559127808, "learning_rate": 1.4985766367823949e-05, "loss": 0.9055, "step": 2542 }, { "epoch": 1.0582377869640953, "grad_norm": 1.4912405014038086, "learning_rate": 1.4981882131320067e-05, "loss": 0.774, "step": 2543 }, { "epoch": 1.0586537711566961, "grad_norm": 1.6247303485870361, "learning_rate": 1.4977996894794608e-05, "loss": 0.9844, "step": 2544 }, { "epoch": 1.0590697553492967, "grad_norm": 1.4541634321212769, "learning_rate": 1.4974110659027461e-05, "loss": 0.7885, "step": 2545 }, { "epoch": 1.0594857395418975, "grad_norm": 1.8903878927230835, "learning_rate": 1.4970223424798716e-05, "loss": 0.9437, "step": 2546 }, { "epoch": 1.059901723734498, "grad_norm": 2.045743227005005, "learning_rate": 1.4966335192888664e-05, "loss": 0.9238, "step": 2547 }, { "epoch": 1.0603177079270987, "grad_norm": 1.80538809299469, "learning_rate": 1.4962445964077797e-05, "loss": 0.8984, "step": 2548 }, { "epoch": 1.0607336921196995, "grad_norm": 1.618675708770752, "learning_rate": 1.4958555739146803e-05, "loss": 0.8174, "step": 2549 }, { "epoch": 1.0611496763123, "grad_norm": 1.5828038454055786, "learning_rate": 1.4954664518876579e-05, "loss": 0.8897, "step": 2550 }, { "epoch": 1.0615656605049009, "grad_norm": 1.581549882888794, "learning_rate": 1.4950772304048212e-05, "loss": 0.7949, "step": 2551 }, { "epoch": 1.0619816446975014, "grad_norm": 1.5780601501464844, "learning_rate": 1.4946879095442992e-05, "loss": 0.8042, "step": 2552 }, { "epoch": 1.0623976288901023, "grad_norm": 1.6278681755065918, "learning_rate": 1.4942984893842416e-05, "loss": 0.8553, "step": 2553 }, { "epoch": 1.0628136130827028, "grad_norm": 1.5411697626113892, "learning_rate": 1.4939089700028166e-05, "loss": 0.8213, "step": 2554 }, { "epoch": 1.0632295972753036, "grad_norm": 1.607146143913269, "learning_rate": 1.4935193514782136e-05, "loss": 0.8013, "step": 2555 }, { "epoch": 1.0636455814679042, "grad_norm": 1.5760705471038818, "learning_rate": 1.493129633888641e-05, "loss": 0.8956, "step": 2556 }, { "epoch": 1.0640615656605048, "grad_norm": 1.6651555299758911, "learning_rate": 1.4927398173123277e-05, "loss": 0.8529, "step": 2557 }, { "epoch": 1.0644775498531056, "grad_norm": 1.9250460863113403, "learning_rate": 1.4923499018275223e-05, "loss": 0.8339, "step": 2558 }, { "epoch": 1.0648935340457062, "grad_norm": 1.5805623531341553, "learning_rate": 1.491959887512493e-05, "loss": 0.8477, "step": 2559 }, { "epoch": 1.065309518238307, "grad_norm": 165.92266845703125, "learning_rate": 1.4915697744455279e-05, "loss": 1.0004, "step": 2560 }, { "epoch": 1.0657255024309076, "grad_norm": 1.523746132850647, "learning_rate": 1.4911795627049355e-05, "loss": 0.7331, "step": 2561 }, { "epoch": 1.0661414866235084, "grad_norm": 1.8685895204544067, "learning_rate": 1.4907892523690432e-05, "loss": 0.8563, "step": 2562 }, { "epoch": 1.066557470816109, "grad_norm": 1.7722874879837036, "learning_rate": 1.4903988435161993e-05, "loss": 0.8319, "step": 2563 }, { "epoch": 1.0669734550087098, "grad_norm": 1.8962352275848389, "learning_rate": 1.4900083362247704e-05, "loss": 0.9381, "step": 2564 }, { "epoch": 1.0673894392013104, "grad_norm": 1.72947359085083, "learning_rate": 1.4896177305731443e-05, "loss": 0.8669, "step": 2565 }, { "epoch": 1.067805423393911, "grad_norm": 1.7919853925704956, "learning_rate": 1.4892270266397275e-05, "loss": 0.9099, "step": 2566 }, { "epoch": 1.0682214075865117, "grad_norm": 1.8401833772659302, "learning_rate": 1.4888362245029468e-05, "loss": 0.9524, "step": 2567 }, { "epoch": 1.0686373917791123, "grad_norm": 1.499634861946106, "learning_rate": 1.488445324241249e-05, "loss": 0.8585, "step": 2568 }, { "epoch": 1.0690533759717131, "grad_norm": 1.6110012531280518, "learning_rate": 1.4880543259330997e-05, "loss": 0.7975, "step": 2569 }, { "epoch": 1.0694693601643137, "grad_norm": 1.5604671239852905, "learning_rate": 1.4876632296569847e-05, "loss": 0.8653, "step": 2570 }, { "epoch": 1.0698853443569145, "grad_norm": 1.6278716325759888, "learning_rate": 1.4872720354914092e-05, "loss": 0.768, "step": 2571 }, { "epoch": 1.070301328549515, "grad_norm": 1.7043588161468506, "learning_rate": 1.4868807435148987e-05, "loss": 0.9073, "step": 2572 }, { "epoch": 1.070717312742116, "grad_norm": 1.564376950263977, "learning_rate": 1.4864893538059977e-05, "loss": 0.7739, "step": 2573 }, { "epoch": 1.0711332969347165, "grad_norm": 1.6073538064956665, "learning_rate": 1.4860978664432705e-05, "loss": 0.8203, "step": 2574 }, { "epoch": 1.071549281127317, "grad_norm": 1.762747883796692, "learning_rate": 1.485706281505301e-05, "loss": 0.9114, "step": 2575 }, { "epoch": 1.0719652653199179, "grad_norm": 1.6039901971817017, "learning_rate": 1.485314599070693e-05, "loss": 0.8357, "step": 2576 }, { "epoch": 1.0723812495125185, "grad_norm": 1.6566996574401855, "learning_rate": 1.4849228192180691e-05, "loss": 0.8085, "step": 2577 }, { "epoch": 1.0727972337051193, "grad_norm": 2.698636770248413, "learning_rate": 1.4845309420260723e-05, "loss": 0.8381, "step": 2578 }, { "epoch": 1.0732132178977198, "grad_norm": 1.679240107536316, "learning_rate": 1.4841389675733646e-05, "loss": 0.8039, "step": 2579 }, { "epoch": 1.0736292020903206, "grad_norm": 1.6372579336166382, "learning_rate": 1.4837468959386276e-05, "loss": 0.7814, "step": 2580 }, { "epoch": 1.0740451862829212, "grad_norm": 1.624830722808838, "learning_rate": 1.4833547272005628e-05, "loss": 0.865, "step": 2581 }, { "epoch": 1.074461170475522, "grad_norm": 1.7069406509399414, "learning_rate": 1.4829624614378906e-05, "loss": 0.8056, "step": 2582 }, { "epoch": 1.0748771546681226, "grad_norm": 1.5622813701629639, "learning_rate": 1.4825700987293514e-05, "loss": 0.8004, "step": 2583 }, { "epoch": 1.0752931388607232, "grad_norm": 8.500130653381348, "learning_rate": 1.4821776391537047e-05, "loss": 0.874, "step": 2584 }, { "epoch": 1.075709123053324, "grad_norm": 1.6136245727539062, "learning_rate": 1.4817850827897293e-05, "loss": 0.9151, "step": 2585 }, { "epoch": 1.0761251072459246, "grad_norm": 1.578760027885437, "learning_rate": 1.481392429716224e-05, "loss": 0.7993, "step": 2586 }, { "epoch": 1.0765410914385254, "grad_norm": 1.6575618982315063, "learning_rate": 1.4809996800120068e-05, "loss": 0.8345, "step": 2587 }, { "epoch": 1.076957075631126, "grad_norm": 1.7084529399871826, "learning_rate": 1.480606833755915e-05, "loss": 0.8409, "step": 2588 }, { "epoch": 1.0773730598237268, "grad_norm": 31.28946876525879, "learning_rate": 1.4802138910268049e-05, "loss": 0.8433, "step": 2589 }, { "epoch": 1.0777890440163274, "grad_norm": 1.45919668674469, "learning_rate": 1.4798208519035529e-05, "loss": 0.6872, "step": 2590 }, { "epoch": 1.0782050282089282, "grad_norm": 1.6579666137695312, "learning_rate": 1.4794277164650544e-05, "loss": 0.7289, "step": 2591 }, { "epoch": 1.0786210124015287, "grad_norm": 1.6975842714309692, "learning_rate": 1.4790344847902242e-05, "loss": 0.8407, "step": 2592 }, { "epoch": 1.0790369965941293, "grad_norm": 1.5754213333129883, "learning_rate": 1.4786411569579961e-05, "loss": 0.9023, "step": 2593 }, { "epoch": 1.0794529807867301, "grad_norm": 1.607442021369934, "learning_rate": 1.4782477330473237e-05, "loss": 0.8028, "step": 2594 }, { "epoch": 1.0798689649793307, "grad_norm": 1.819854497909546, "learning_rate": 1.4778542131371794e-05, "loss": 0.8325, "step": 2595 }, { "epoch": 1.0802849491719315, "grad_norm": 60.49199295043945, "learning_rate": 1.4774605973065552e-05, "loss": 0.9294, "step": 2596 }, { "epoch": 1.080700933364532, "grad_norm": 108.61138916015625, "learning_rate": 1.4770668856344626e-05, "loss": 0.9828, "step": 2597 }, { "epoch": 1.081116917557133, "grad_norm": 1.4990159273147583, "learning_rate": 1.4766730781999315e-05, "loss": 0.8349, "step": 2598 }, { "epoch": 1.0815329017497335, "grad_norm": 1.5534048080444336, "learning_rate": 1.4762791750820115e-05, "loss": 0.8352, "step": 2599 }, { "epoch": 1.0819488859423343, "grad_norm": 5.054703712463379, "learning_rate": 1.4758851763597721e-05, "loss": 0.8966, "step": 2600 }, { "epoch": 1.0823648701349349, "grad_norm": 1711.3074951171875, "learning_rate": 1.4754910821123008e-05, "loss": 0.8503, "step": 2601 }, { "epoch": 1.0827808543275355, "grad_norm": 1.6494371891021729, "learning_rate": 1.4750968924187046e-05, "loss": 0.8068, "step": 2602 }, { "epoch": 1.0831968385201363, "grad_norm": 10.787176132202148, "learning_rate": 1.4747026073581106e-05, "loss": 0.9215, "step": 2603 }, { "epoch": 1.0836128227127368, "grad_norm": 1.7287580966949463, "learning_rate": 1.4743082270096635e-05, "loss": 0.9607, "step": 2604 }, { "epoch": 1.0840288069053376, "grad_norm": 1.6236951351165771, "learning_rate": 1.4739137514525285e-05, "loss": 0.7907, "step": 2605 }, { "epoch": 1.0844447910979382, "grad_norm": 20.142000198364258, "learning_rate": 1.4735191807658891e-05, "loss": 0.8477, "step": 2606 }, { "epoch": 1.084860775290539, "grad_norm": 1.544407606124878, "learning_rate": 1.4731245150289483e-05, "loss": 0.8034, "step": 2607 }, { "epoch": 1.0852767594831396, "grad_norm": 1.8594095706939697, "learning_rate": 1.4727297543209279e-05, "loss": 0.8541, "step": 2608 }, { "epoch": 1.0856927436757404, "grad_norm": 1.5980029106140137, "learning_rate": 1.4723348987210688e-05, "loss": 0.8045, "step": 2609 }, { "epoch": 1.086108727868341, "grad_norm": 144.30789184570312, "learning_rate": 1.471939948308631e-05, "loss": 0.8051, "step": 2610 }, { "epoch": 1.0865247120609416, "grad_norm": 1.7856264114379883, "learning_rate": 1.4715449031628937e-05, "loss": 0.8777, "step": 2611 }, { "epoch": 1.0869406962535424, "grad_norm": 5.244564533233643, "learning_rate": 1.4711497633631553e-05, "loss": 1.0012, "step": 2612 }, { "epoch": 1.087356680446143, "grad_norm": 1.6062216758728027, "learning_rate": 1.4707545289887322e-05, "loss": 0.8079, "step": 2613 }, { "epoch": 1.0877726646387438, "grad_norm": 1.7825497388839722, "learning_rate": 1.4703592001189608e-05, "loss": 0.8106, "step": 2614 }, { "epoch": 1.0881886488313444, "grad_norm": 1.7281652688980103, "learning_rate": 1.469963776833196e-05, "loss": 0.849, "step": 2615 }, { "epoch": 1.0886046330239452, "grad_norm": 1.4952605962753296, "learning_rate": 1.469568259210812e-05, "loss": 0.7228, "step": 2616 }, { "epoch": 1.0890206172165457, "grad_norm": 1.8212599754333496, "learning_rate": 1.4691726473312017e-05, "loss": 0.8728, "step": 2617 }, { "epoch": 1.0894366014091466, "grad_norm": 2.849355697631836, "learning_rate": 1.4687769412737766e-05, "loss": 0.985, "step": 2618 }, { "epoch": 1.0898525856017471, "grad_norm": 1.586274266242981, "learning_rate": 1.4683811411179678e-05, "loss": 0.9307, "step": 2619 }, { "epoch": 1.0902685697943477, "grad_norm": 1.7150853872299194, "learning_rate": 1.467985246943225e-05, "loss": 0.9551, "step": 2620 }, { "epoch": 1.0906845539869485, "grad_norm": 1.6834183931350708, "learning_rate": 1.4675892588290167e-05, "loss": 0.7826, "step": 2621 }, { "epoch": 1.091100538179549, "grad_norm": 1.7958745956420898, "learning_rate": 1.4671931768548295e-05, "loss": 0.9482, "step": 2622 }, { "epoch": 1.09151652237215, "grad_norm": 3.291560173034668, "learning_rate": 1.4667970011001705e-05, "loss": 0.9149, "step": 2623 }, { "epoch": 1.0919325065647505, "grad_norm": 1.5080764293670654, "learning_rate": 1.4664007316445644e-05, "loss": 0.7667, "step": 2624 }, { "epoch": 1.0923484907573513, "grad_norm": 1.8026260137557983, "learning_rate": 1.466004368567555e-05, "loss": 0.9023, "step": 2625 }, { "epoch": 1.0927644749499519, "grad_norm": 1.6947944164276123, "learning_rate": 1.465607911948705e-05, "loss": 0.9215, "step": 2626 }, { "epoch": 1.0931804591425527, "grad_norm": 1.7048033475875854, "learning_rate": 1.465211361867596e-05, "loss": 0.8284, "step": 2627 }, { "epoch": 1.0935964433351533, "grad_norm": 1.7592378854751587, "learning_rate": 1.4648147184038277e-05, "loss": 0.8496, "step": 2628 }, { "epoch": 1.0940124275277539, "grad_norm": 1.702781081199646, "learning_rate": 1.4644179816370192e-05, "loss": 0.835, "step": 2629 }, { "epoch": 1.0944284117203547, "grad_norm": 1.7306500673294067, "learning_rate": 1.4640211516468083e-05, "loss": 0.9251, "step": 2630 }, { "epoch": 1.0948443959129552, "grad_norm": 1.584286093711853, "learning_rate": 1.4636242285128512e-05, "loss": 0.7819, "step": 2631 }, { "epoch": 1.095260380105556, "grad_norm": 1.572631597518921, "learning_rate": 1.463227212314823e-05, "loss": 0.8453, "step": 2632 }, { "epoch": 1.0956763642981566, "grad_norm": 12.777314186096191, "learning_rate": 1.4628301031324172e-05, "loss": 0.8298, "step": 2633 }, { "epoch": 1.0960923484907574, "grad_norm": 1.5308094024658203, "learning_rate": 1.4624329010453467e-05, "loss": 0.8555, "step": 2634 }, { "epoch": 1.096508332683358, "grad_norm": 1.792627215385437, "learning_rate": 1.4620356061333421e-05, "loss": 0.8839, "step": 2635 }, { "epoch": 1.0969243168759588, "grad_norm": 1.8575860261917114, "learning_rate": 1.4616382184761535e-05, "loss": 0.9938, "step": 2636 }, { "epoch": 1.0973403010685594, "grad_norm": 1.6344835758209229, "learning_rate": 1.4612407381535488e-05, "loss": 0.8024, "step": 2637 }, { "epoch": 1.09775628526116, "grad_norm": 1.6134051084518433, "learning_rate": 1.4608431652453147e-05, "loss": 0.7564, "step": 2638 }, { "epoch": 1.0981722694537608, "grad_norm": 1.7106907367706299, "learning_rate": 1.4604454998312574e-05, "loss": 0.8873, "step": 2639 }, { "epoch": 1.0985882536463614, "grad_norm": 1.7490379810333252, "learning_rate": 1.4600477419912002e-05, "loss": 0.9261, "step": 2640 }, { "epoch": 1.0990042378389622, "grad_norm": 1.6436771154403687, "learning_rate": 1.4596498918049866e-05, "loss": 0.8414, "step": 2641 }, { "epoch": 1.0994202220315628, "grad_norm": 1.7186774015426636, "learning_rate": 1.4592519493524768e-05, "loss": 0.8793, "step": 2642 }, { "epoch": 1.0998362062241636, "grad_norm": 1.6139906644821167, "learning_rate": 1.4588539147135507e-05, "loss": 0.7611, "step": 2643 }, { "epoch": 1.1002521904167641, "grad_norm": 1.7364238500595093, "learning_rate": 1.4584557879681066e-05, "loss": 0.8766, "step": 2644 }, { "epoch": 1.100668174609365, "grad_norm": 1.7345606088638306, "learning_rate": 1.4580575691960614e-05, "loss": 0.9228, "step": 2645 }, { "epoch": 1.1010841588019655, "grad_norm": 1.6952892541885376, "learning_rate": 1.4576592584773498e-05, "loss": 0.8811, "step": 2646 }, { "epoch": 1.1015001429945661, "grad_norm": 1.732276201248169, "learning_rate": 1.457260855891926e-05, "loss": 0.7504, "step": 2647 }, { "epoch": 1.101916127187167, "grad_norm": 1.863016963005066, "learning_rate": 1.4568623615197608e-05, "loss": 0.83, "step": 2648 }, { "epoch": 1.1023321113797675, "grad_norm": 1.6379657983779907, "learning_rate": 1.4564637754408457e-05, "loss": 0.7941, "step": 2649 }, { "epoch": 1.1027480955723683, "grad_norm": 1.5389512777328491, "learning_rate": 1.4560650977351896e-05, "loss": 0.9263, "step": 2650 }, { "epoch": 1.1031640797649689, "grad_norm": 2.8498919010162354, "learning_rate": 1.4556663284828193e-05, "loss": 0.7827, "step": 2651 }, { "epoch": 1.1035800639575697, "grad_norm": 1.7341127395629883, "learning_rate": 1.4552674677637804e-05, "loss": 0.8549, "step": 2652 }, { "epoch": 1.1039960481501703, "grad_norm": 1.6495503187179565, "learning_rate": 1.454868515658137e-05, "loss": 0.8217, "step": 2653 }, { "epoch": 1.104412032342771, "grad_norm": 1.8187296390533447, "learning_rate": 1.4544694722459719e-05, "loss": 0.8565, "step": 2654 }, { "epoch": 1.1048280165353717, "grad_norm": 1.6063423156738281, "learning_rate": 1.4540703376073846e-05, "loss": 0.9181, "step": 2655 }, { "epoch": 1.1052440007279722, "grad_norm": 1.5927648544311523, "learning_rate": 1.4536711118224953e-05, "loss": 0.7955, "step": 2656 }, { "epoch": 1.105659984920573, "grad_norm": 1.66209876537323, "learning_rate": 1.4532717949714408e-05, "loss": 0.8967, "step": 2657 }, { "epoch": 1.1060759691131736, "grad_norm": 1.6302975416183472, "learning_rate": 1.4528723871343762e-05, "loss": 0.8168, "step": 2658 }, { "epoch": 1.1064919533057744, "grad_norm": 10.607521057128906, "learning_rate": 1.4524728883914759e-05, "loss": 0.9111, "step": 2659 }, { "epoch": 1.106907937498375, "grad_norm": 1.6223437786102295, "learning_rate": 1.4520732988229315e-05, "loss": 0.9003, "step": 2660 }, { "epoch": 1.1073239216909758, "grad_norm": 1.666259765625, "learning_rate": 1.451673618508954e-05, "loss": 0.8073, "step": 2661 }, { "epoch": 1.1077399058835764, "grad_norm": 1.6728649139404297, "learning_rate": 1.4512738475297712e-05, "loss": 0.9128, "step": 2662 }, { "epoch": 1.1081558900761772, "grad_norm": 1.4451323747634888, "learning_rate": 1.4508739859656298e-05, "loss": 0.7586, "step": 2663 }, { "epoch": 1.1085718742687778, "grad_norm": 1.8400235176086426, "learning_rate": 1.450474033896795e-05, "loss": 1.0131, "step": 2664 }, { "epoch": 1.1089878584613784, "grad_norm": 92.07283020019531, "learning_rate": 1.4500739914035498e-05, "loss": 0.7883, "step": 2665 }, { "epoch": 1.1094038426539792, "grad_norm": 1.6714779138565063, "learning_rate": 1.4496738585661958e-05, "loss": 0.6888, "step": 2666 }, { "epoch": 1.1098198268465798, "grad_norm": 1.7845793962478638, "learning_rate": 1.4492736354650514e-05, "loss": 0.8809, "step": 2667 }, { "epoch": 1.1102358110391806, "grad_norm": 2.3454816341400146, "learning_rate": 1.4488733221804547e-05, "loss": 0.8706, "step": 2668 }, { "epoch": 1.1106517952317811, "grad_norm": 1.6345702409744263, "learning_rate": 1.4484729187927613e-05, "loss": 0.8827, "step": 2669 }, { "epoch": 1.111067779424382, "grad_norm": 1.7648953199386597, "learning_rate": 1.4480724253823444e-05, "loss": 0.9277, "step": 2670 }, { "epoch": 1.1114837636169825, "grad_norm": 1.7490525245666504, "learning_rate": 1.4476718420295964e-05, "loss": 0.8386, "step": 2671 }, { "epoch": 1.1118997478095833, "grad_norm": 1.741738200187683, "learning_rate": 1.4472711688149263e-05, "loss": 0.8845, "step": 2672 }, { "epoch": 1.112315732002184, "grad_norm": 1.6119780540466309, "learning_rate": 1.4468704058187623e-05, "loss": 0.8386, "step": 2673 }, { "epoch": 1.1127317161947845, "grad_norm": 1.6576718091964722, "learning_rate": 1.44646955312155e-05, "loss": 0.8307, "step": 2674 }, { "epoch": 1.1131477003873853, "grad_norm": 1.7025063037872314, "learning_rate": 1.4460686108037538e-05, "loss": 0.9189, "step": 2675 }, { "epoch": 1.113563684579986, "grad_norm": 1.5564121007919312, "learning_rate": 1.4456675789458552e-05, "loss": 0.8148, "step": 2676 }, { "epoch": 1.1139796687725867, "grad_norm": 1.6578872203826904, "learning_rate": 1.4452664576283537e-05, "loss": 0.7874, "step": 2677 }, { "epoch": 1.1143956529651873, "grad_norm": 20.605432510375977, "learning_rate": 1.4448652469317674e-05, "loss": 0.8869, "step": 2678 }, { "epoch": 1.114811637157788, "grad_norm": 1.7321701049804688, "learning_rate": 1.4444639469366317e-05, "loss": 0.8517, "step": 2679 }, { "epoch": 1.1152276213503887, "grad_norm": 1.6390845775604248, "learning_rate": 1.4440625577235009e-05, "loss": 0.856, "step": 2680 }, { "epoch": 1.1156436055429895, "grad_norm": 48.94026565551758, "learning_rate": 1.4436610793729458e-05, "loss": 0.8091, "step": 2681 }, { "epoch": 1.11605958973559, "grad_norm": 12.935559272766113, "learning_rate": 1.443259511965556e-05, "loss": 0.7972, "step": 2682 }, { "epoch": 1.1164755739281906, "grad_norm": 1.6585012674331665, "learning_rate": 1.4428578555819389e-05, "loss": 0.8196, "step": 2683 }, { "epoch": 1.1168915581207914, "grad_norm": 57.942840576171875, "learning_rate": 1.4424561103027195e-05, "loss": 0.8214, "step": 2684 }, { "epoch": 1.117307542313392, "grad_norm": 1.7303146123886108, "learning_rate": 1.4420542762085412e-05, "loss": 0.8385, "step": 2685 }, { "epoch": 1.1177235265059928, "grad_norm": 1.6568491458892822, "learning_rate": 1.4416523533800647e-05, "loss": 0.8317, "step": 2686 }, { "epoch": 1.1181395106985934, "grad_norm": 1.719585657119751, "learning_rate": 1.4412503418979683e-05, "loss": 0.8697, "step": 2687 }, { "epoch": 1.1185554948911942, "grad_norm": 1.7585082054138184, "learning_rate": 1.440848241842949e-05, "loss": 0.8664, "step": 2688 }, { "epoch": 1.1189714790837948, "grad_norm": 1.6452215909957886, "learning_rate": 1.4404460532957206e-05, "loss": 0.8076, "step": 2689 }, { "epoch": 1.1193874632763956, "grad_norm": 1.8238952159881592, "learning_rate": 1.440043776337015e-05, "loss": 0.8882, "step": 2690 }, { "epoch": 1.1198034474689962, "grad_norm": 1.554466724395752, "learning_rate": 1.4396414110475822e-05, "loss": 0.8078, "step": 2691 }, { "epoch": 1.1202194316615968, "grad_norm": 5.638359546661377, "learning_rate": 1.4392389575081896e-05, "loss": 0.8038, "step": 2692 }, { "epoch": 1.1206354158541976, "grad_norm": 1.6608446836471558, "learning_rate": 1.4388364157996221e-05, "loss": 0.7811, "step": 2693 }, { "epoch": 1.1210514000467982, "grad_norm": 2.545034170150757, "learning_rate": 1.4384337860026829e-05, "loss": 0.9237, "step": 2694 }, { "epoch": 1.121467384239399, "grad_norm": 1.6692205667495728, "learning_rate": 1.4380310681981927e-05, "loss": 0.7768, "step": 2695 }, { "epoch": 1.1218833684319995, "grad_norm": 1.6102737188339233, "learning_rate": 1.4376282624669892e-05, "loss": 0.8214, "step": 2696 }, { "epoch": 1.1222993526246003, "grad_norm": 1.5538593530654907, "learning_rate": 1.437225368889929e-05, "loss": 0.8062, "step": 2697 }, { "epoch": 1.122715336817201, "grad_norm": 1.8063111305236816, "learning_rate": 1.4368223875478848e-05, "loss": 0.9165, "step": 2698 }, { "epoch": 1.1231313210098017, "grad_norm": 1.6730672121047974, "learning_rate": 1.4364193185217484e-05, "loss": 0.7692, "step": 2699 }, { "epoch": 1.1235473052024023, "grad_norm": 1.6571714878082275, "learning_rate": 1.4360161618924283e-05, "loss": 0.8571, "step": 2700 }, { "epoch": 1.123963289395003, "grad_norm": 1.687447190284729, "learning_rate": 1.4356129177408504e-05, "loss": 0.9276, "step": 2701 }, { "epoch": 1.1243792735876037, "grad_norm": 1.752445936203003, "learning_rate": 1.435209586147959e-05, "loss": 0.9384, "step": 2702 }, { "epoch": 1.1247952577802043, "grad_norm": 1.7418359518051147, "learning_rate": 1.4348061671947153e-05, "loss": 0.934, "step": 2703 }, { "epoch": 1.125211241972805, "grad_norm": 1.5978144407272339, "learning_rate": 1.4344026609620984e-05, "loss": 0.8446, "step": 2704 }, { "epoch": 1.1256272261654057, "grad_norm": 1.6974371671676636, "learning_rate": 1.433999067531105e-05, "loss": 0.8814, "step": 2705 }, { "epoch": 1.1260432103580065, "grad_norm": 1.5683274269104004, "learning_rate": 1.4335953869827491e-05, "loss": 0.8458, "step": 2706 }, { "epoch": 1.126459194550607, "grad_norm": 1.5667812824249268, "learning_rate": 1.4331916193980617e-05, "loss": 0.8535, "step": 2707 }, { "epoch": 1.1268751787432079, "grad_norm": 1.718801498413086, "learning_rate": 1.432787764858092e-05, "loss": 0.782, "step": 2708 }, { "epoch": 1.1272911629358084, "grad_norm": 1.6518663167953491, "learning_rate": 1.432383823443906e-05, "loss": 0.8306, "step": 2709 }, { "epoch": 1.127707147128409, "grad_norm": 1.7114933729171753, "learning_rate": 1.4319797952365883e-05, "loss": 0.9756, "step": 2710 }, { "epoch": 1.1281231313210098, "grad_norm": 1.5203073024749756, "learning_rate": 1.4315756803172401e-05, "loss": 0.8677, "step": 2711 }, { "epoch": 1.1285391155136104, "grad_norm": 1.6349472999572754, "learning_rate": 1.4311714787669792e-05, "loss": 0.8586, "step": 2712 }, { "epoch": 1.1289550997062112, "grad_norm": 1.5353738069534302, "learning_rate": 1.4307671906669428e-05, "loss": 0.8703, "step": 2713 }, { "epoch": 1.1293710838988118, "grad_norm": 1.6669018268585205, "learning_rate": 1.4303628160982831e-05, "loss": 0.8873, "step": 2714 }, { "epoch": 1.1297870680914126, "grad_norm": 1.5931185483932495, "learning_rate": 1.4299583551421721e-05, "loss": 0.7979, "step": 2715 }, { "epoch": 1.1302030522840132, "grad_norm": 1.6815392971038818, "learning_rate": 1.4295538078797972e-05, "loss": 0.812, "step": 2716 }, { "epoch": 1.130619036476614, "grad_norm": 1.7972608804702759, "learning_rate": 1.429149174392364e-05, "loss": 0.8478, "step": 2717 }, { "epoch": 1.1310350206692146, "grad_norm": 1.6328589916229248, "learning_rate": 1.4287444547610951e-05, "loss": 0.8174, "step": 2718 }, { "epoch": 1.1314510048618152, "grad_norm": 1.624837875366211, "learning_rate": 1.428339649067231e-05, "loss": 0.8324, "step": 2719 }, { "epoch": 1.131866989054416, "grad_norm": 1.830607533454895, "learning_rate": 1.4279347573920288e-05, "loss": 0.9579, "step": 2720 }, { "epoch": 1.1322829732470165, "grad_norm": 1.6083581447601318, "learning_rate": 1.4275297798167629e-05, "loss": 0.9243, "step": 2721 }, { "epoch": 1.1326989574396173, "grad_norm": 1.52564537525177, "learning_rate": 1.4271247164227252e-05, "loss": 0.8058, "step": 2722 }, { "epoch": 1.133114941632218, "grad_norm": 1.4757404327392578, "learning_rate": 1.426719567291225e-05, "loss": 0.7639, "step": 2723 }, { "epoch": 1.1335309258248187, "grad_norm": 117.25328826904297, "learning_rate": 1.426314332503588e-05, "loss": 0.8959, "step": 2724 }, { "epoch": 1.1339469100174193, "grad_norm": 32.059967041015625, "learning_rate": 1.4259090121411585e-05, "loss": 0.7906, "step": 2725 }, { "epoch": 1.1343628942100201, "grad_norm": 1.7791491746902466, "learning_rate": 1.4255036062852967e-05, "loss": 0.9756, "step": 2726 }, { "epoch": 1.1347788784026207, "grad_norm": 1.690891981124878, "learning_rate": 1.4250981150173801e-05, "loss": 0.8373, "step": 2727 }, { "epoch": 1.1351948625952213, "grad_norm": 1.6656142473220825, "learning_rate": 1.4246925384188042e-05, "loss": 0.8156, "step": 2728 }, { "epoch": 1.135610846787822, "grad_norm": 1.670377492904663, "learning_rate": 1.4242868765709808e-05, "loss": 0.8494, "step": 2729 }, { "epoch": 1.1360268309804227, "grad_norm": 1.6528170108795166, "learning_rate": 1.4238811295553392e-05, "loss": 0.8574, "step": 2730 }, { "epoch": 1.1364428151730235, "grad_norm": 1.5044008493423462, "learning_rate": 1.4234752974533253e-05, "loss": 0.9529, "step": 2731 }, { "epoch": 1.136858799365624, "grad_norm": 14.728015899658203, "learning_rate": 1.4230693803464029e-05, "loss": 0.8263, "step": 2732 }, { "epoch": 1.1372747835582249, "grad_norm": 1.6580393314361572, "learning_rate": 1.4226633783160521e-05, "loss": 0.793, "step": 2733 }, { "epoch": 1.1376907677508254, "grad_norm": 2.8563809394836426, "learning_rate": 1.4222572914437707e-05, "loss": 0.815, "step": 2734 }, { "epoch": 1.1381067519434263, "grad_norm": 1.6719673871994019, "learning_rate": 1.4218511198110731e-05, "loss": 0.8571, "step": 2735 }, { "epoch": 1.1385227361360268, "grad_norm": 1.7770684957504272, "learning_rate": 1.4214448634994909e-05, "loss": 0.8277, "step": 2736 }, { "epoch": 1.1389387203286274, "grad_norm": 1.7295989990234375, "learning_rate": 1.4210385225905721e-05, "loss": 0.9447, "step": 2737 }, { "epoch": 1.1393547045212282, "grad_norm": 1.7369019985198975, "learning_rate": 1.4206320971658825e-05, "loss": 0.742, "step": 2738 }, { "epoch": 1.1397706887138288, "grad_norm": 10.385313987731934, "learning_rate": 1.4202255873070047e-05, "loss": 0.9166, "step": 2739 }, { "epoch": 1.1401866729064296, "grad_norm": 1.5887597799301147, "learning_rate": 1.4198189930955382e-05, "loss": 0.6366, "step": 2740 }, { "epoch": 1.1406026570990302, "grad_norm": 1.7081592082977295, "learning_rate": 1.4194123146130991e-05, "loss": 0.8655, "step": 2741 }, { "epoch": 1.141018641291631, "grad_norm": 1.6142548322677612, "learning_rate": 1.4190055519413206e-05, "loss": 0.7743, "step": 2742 }, { "epoch": 1.1414346254842316, "grad_norm": 1.7412456274032593, "learning_rate": 1.4185987051618532e-05, "loss": 0.8331, "step": 2743 }, { "epoch": 1.1418506096768324, "grad_norm": 1.5194073915481567, "learning_rate": 1.4181917743563636e-05, "loss": 0.7112, "step": 2744 }, { "epoch": 1.142266593869433, "grad_norm": 1.5827864408493042, "learning_rate": 1.4177847596065359e-05, "loss": 0.7704, "step": 2745 }, { "epoch": 1.1426825780620335, "grad_norm": 1.656746745109558, "learning_rate": 1.4173776609940708e-05, "loss": 0.7852, "step": 2746 }, { "epoch": 1.1430985622546344, "grad_norm": 1.6471396684646606, "learning_rate": 1.4169704786006858e-05, "loss": 0.8005, "step": 2747 }, { "epoch": 1.143514546447235, "grad_norm": 1.5610885620117188, "learning_rate": 1.4165632125081152e-05, "loss": 0.8975, "step": 2748 }, { "epoch": 1.1439305306398357, "grad_norm": 1.7009873390197754, "learning_rate": 1.4161558627981105e-05, "loss": 0.8323, "step": 2749 }, { "epoch": 1.1443465148324363, "grad_norm": 1.7356758117675781, "learning_rate": 1.4157484295524397e-05, "loss": 0.8671, "step": 2750 }, { "epoch": 1.1447624990250371, "grad_norm": 1.7369346618652344, "learning_rate": 1.4153409128528873e-05, "loss": 0.8852, "step": 2751 }, { "epoch": 1.1451784832176377, "grad_norm": 1.6424332857131958, "learning_rate": 1.4149333127812551e-05, "loss": 0.8679, "step": 2752 }, { "epoch": 1.1455944674102385, "grad_norm": 1.867295503616333, "learning_rate": 1.4145256294193607e-05, "loss": 0.9889, "step": 2753 }, { "epoch": 1.146010451602839, "grad_norm": 1.6192121505737305, "learning_rate": 1.41411786284904e-05, "loss": 0.7482, "step": 2754 }, { "epoch": 1.1464264357954397, "grad_norm": 1.5888522863388062, "learning_rate": 1.4137100131521442e-05, "loss": 0.8911, "step": 2755 }, { "epoch": 1.1468424199880405, "grad_norm": 1.5949699878692627, "learning_rate": 1.4133020804105415e-05, "loss": 0.8687, "step": 2756 }, { "epoch": 1.147258404180641, "grad_norm": 1.59811270236969, "learning_rate": 1.4128940647061171e-05, "loss": 0.8656, "step": 2757 }, { "epoch": 1.1476743883732419, "grad_norm": 87.1365737915039, "learning_rate": 1.412485966120773e-05, "loss": 0.7949, "step": 2758 }, { "epoch": 1.1480903725658425, "grad_norm": 1.8842108249664307, "learning_rate": 1.412077784736427e-05, "loss": 0.8742, "step": 2759 }, { "epoch": 1.1485063567584433, "grad_norm": 1.5907995700836182, "learning_rate": 1.4116695206350144e-05, "loss": 0.8025, "step": 2760 }, { "epoch": 1.1489223409510438, "grad_norm": 1.8182913064956665, "learning_rate": 1.4112611738984865e-05, "loss": 0.8259, "step": 2761 }, { "epoch": 1.1493383251436446, "grad_norm": 1.8183499574661255, "learning_rate": 1.4108527446088114e-05, "loss": 0.8787, "step": 2762 }, { "epoch": 1.1497543093362452, "grad_norm": 1.7098747491836548, "learning_rate": 1.4104442328479738e-05, "loss": 0.8678, "step": 2763 }, { "epoch": 1.1501702935288458, "grad_norm": 1.8179702758789062, "learning_rate": 1.4100356386979753e-05, "loss": 0.9107, "step": 2764 }, { "epoch": 1.1505862777214466, "grad_norm": 1.563295602798462, "learning_rate": 1.4096269622408334e-05, "loss": 0.7555, "step": 2765 }, { "epoch": 1.1510022619140472, "grad_norm": 1.8003618717193604, "learning_rate": 1.4092182035585824e-05, "loss": 0.9138, "step": 2766 }, { "epoch": 1.151418246106648, "grad_norm": 1.704463005065918, "learning_rate": 1.408809362733273e-05, "loss": 0.802, "step": 2767 }, { "epoch": 1.1518342302992486, "grad_norm": 1.4736368656158447, "learning_rate": 1.408400439846973e-05, "loss": 0.7703, "step": 2768 }, { "epoch": 1.1522502144918494, "grad_norm": 1.8214670419692993, "learning_rate": 1.4079914349817655e-05, "loss": 0.7586, "step": 2769 }, { "epoch": 1.15266619868445, "grad_norm": 1.6620463132858276, "learning_rate": 1.4075823482197515e-05, "loss": 0.8639, "step": 2770 }, { "epoch": 1.1530821828770508, "grad_norm": 1.6150007247924805, "learning_rate": 1.407173179643047e-05, "loss": 0.8612, "step": 2771 }, { "epoch": 1.1534981670696514, "grad_norm": 1.577540636062622, "learning_rate": 1.4067639293337854e-05, "loss": 0.8447, "step": 2772 }, { "epoch": 1.153914151262252, "grad_norm": 1.5922586917877197, "learning_rate": 1.4063545973741165e-05, "loss": 0.7342, "step": 2773 }, { "epoch": 1.1543301354548527, "grad_norm": 1.688232421875, "learning_rate": 1.4059451838462055e-05, "loss": 0.8052, "step": 2774 }, { "epoch": 1.1547461196474533, "grad_norm": 1.5799343585968018, "learning_rate": 1.4055356888322352e-05, "loss": 0.8441, "step": 2775 }, { "epoch": 1.1551621038400541, "grad_norm": 1.7561861276626587, "learning_rate": 1.4051261124144038e-05, "loss": 0.8562, "step": 2776 }, { "epoch": 1.1555780880326547, "grad_norm": 1.6392779350280762, "learning_rate": 1.4047164546749267e-05, "loss": 0.8634, "step": 2777 }, { "epoch": 1.1559940722252555, "grad_norm": 1.5893620252609253, "learning_rate": 1.4043067156960351e-05, "loss": 0.9017, "step": 2778 }, { "epoch": 1.156410056417856, "grad_norm": 1.6695663928985596, "learning_rate": 1.4038968955599765e-05, "loss": 0.8864, "step": 2779 }, { "epoch": 1.156826040610457, "grad_norm": 1.5459214448928833, "learning_rate": 1.4034869943490146e-05, "loss": 0.6819, "step": 2780 }, { "epoch": 1.1572420248030575, "grad_norm": 8.427449226379395, "learning_rate": 1.4030770121454298e-05, "loss": 0.7962, "step": 2781 }, { "epoch": 1.157658008995658, "grad_norm": 1.6382755041122437, "learning_rate": 1.4026669490315183e-05, "loss": 0.8037, "step": 2782 }, { "epoch": 1.1580739931882589, "grad_norm": 1.8246296644210815, "learning_rate": 1.4022568050895932e-05, "loss": 0.9366, "step": 2783 }, { "epoch": 1.1584899773808595, "grad_norm": 1.6299049854278564, "learning_rate": 1.4018465804019829e-05, "loss": 0.9401, "step": 2784 }, { "epoch": 1.1589059615734603, "grad_norm": 1.6958708763122559, "learning_rate": 1.4014362750510328e-05, "loss": 0.8834, "step": 2785 }, { "epoch": 1.1593219457660608, "grad_norm": 1.9484713077545166, "learning_rate": 1.401025889119104e-05, "loss": 0.9028, "step": 2786 }, { "epoch": 1.1597379299586617, "grad_norm": 1.6305757761001587, "learning_rate": 1.4006154226885743e-05, "loss": 0.8096, "step": 2787 }, { "epoch": 1.1601539141512622, "grad_norm": 1.623530387878418, "learning_rate": 1.4002048758418365e-05, "loss": 0.8554, "step": 2788 }, { "epoch": 1.160569898343863, "grad_norm": 1.5487409830093384, "learning_rate": 1.3997942486613013e-05, "loss": 0.8407, "step": 2789 }, { "epoch": 1.1609858825364636, "grad_norm": 1.7091922760009766, "learning_rate": 1.3993835412293943e-05, "loss": 0.9311, "step": 2790 }, { "epoch": 1.1614018667290642, "grad_norm": 1.5242406129837036, "learning_rate": 1.3989727536285571e-05, "loss": 0.7276, "step": 2791 }, { "epoch": 1.161817850921665, "grad_norm": 1.5281996726989746, "learning_rate": 1.3985618859412484e-05, "loss": 0.9054, "step": 2792 }, { "epoch": 1.1622338351142656, "grad_norm": 1.6928189992904663, "learning_rate": 1.3981509382499416e-05, "loss": 0.9285, "step": 2793 }, { "epoch": 1.1626498193068664, "grad_norm": 1.5737501382827759, "learning_rate": 1.3977399106371278e-05, "loss": 0.9744, "step": 2794 }, { "epoch": 1.163065803499467, "grad_norm": 1.7950818538665771, "learning_rate": 1.3973288031853127e-05, "loss": 0.9056, "step": 2795 }, { "epoch": 1.1634817876920678, "grad_norm": 1.5872656106948853, "learning_rate": 1.3969176159770186e-05, "loss": 0.7987, "step": 2796 }, { "epoch": 1.1638977718846684, "grad_norm": 1.6946640014648438, "learning_rate": 1.3965063490947838e-05, "loss": 0.9724, "step": 2797 }, { "epoch": 1.1643137560772692, "grad_norm": 1.571424961090088, "learning_rate": 1.3960950026211628e-05, "loss": 0.8422, "step": 2798 }, { "epoch": 1.1647297402698698, "grad_norm": 26.759065628051758, "learning_rate": 1.395683576638726e-05, "loss": 0.7968, "step": 2799 }, { "epoch": 1.1651457244624703, "grad_norm": 1.5947805643081665, "learning_rate": 1.395272071230059e-05, "loss": 0.8315, "step": 2800 }, { "epoch": 1.1655617086550711, "grad_norm": 1.6260650157928467, "learning_rate": 1.3948604864777647e-05, "loss": 0.9189, "step": 2801 }, { "epoch": 1.1659776928476717, "grad_norm": 5.474784851074219, "learning_rate": 1.3944488224644605e-05, "loss": 0.8508, "step": 2802 }, { "epoch": 1.1663936770402725, "grad_norm": 1.6754562854766846, "learning_rate": 1.394037079272781e-05, "loss": 0.8347, "step": 2803 }, { "epoch": 1.166809661232873, "grad_norm": 1.5753169059753418, "learning_rate": 1.3936252569853761e-05, "loss": 0.8794, "step": 2804 }, { "epoch": 1.167225645425474, "grad_norm": 1.8333852291107178, "learning_rate": 1.3932133556849113e-05, "loss": 0.8626, "step": 2805 }, { "epoch": 1.1676416296180745, "grad_norm": 1.6632939577102661, "learning_rate": 1.3928013754540681e-05, "loss": 0.8553, "step": 2806 }, { "epoch": 1.1680576138106753, "grad_norm": 1.6202741861343384, "learning_rate": 1.3923893163755441e-05, "loss": 0.8444, "step": 2807 }, { "epoch": 1.1684735980032759, "grad_norm": 1.7495925426483154, "learning_rate": 1.391977178532053e-05, "loss": 0.8154, "step": 2808 }, { "epoch": 1.1688895821958765, "grad_norm": 1.552993893623352, "learning_rate": 1.3915649620063239e-05, "loss": 0.8255, "step": 2809 }, { "epoch": 1.1693055663884773, "grad_norm": 1.5895016193389893, "learning_rate": 1.3911526668811008e-05, "loss": 0.8057, "step": 2810 }, { "epoch": 1.1697215505810779, "grad_norm": 1.6121792793273926, "learning_rate": 1.3907402932391455e-05, "loss": 0.7784, "step": 2811 }, { "epoch": 1.1701375347736787, "grad_norm": 1.6264269351959229, "learning_rate": 1.3903278411632335e-05, "loss": 0.8241, "step": 2812 }, { "epoch": 1.1705535189662792, "grad_norm": 1.5008364915847778, "learning_rate": 1.3899153107361579e-05, "loss": 0.7797, "step": 2813 }, { "epoch": 1.17096950315888, "grad_norm": 1.7874938249588013, "learning_rate": 1.3895027020407264e-05, "loss": 0.7812, "step": 2814 }, { "epoch": 1.1713854873514806, "grad_norm": 1.538192629814148, "learning_rate": 1.389090015159762e-05, "loss": 0.9268, "step": 2815 }, { "epoch": 1.1718014715440814, "grad_norm": 58.915809631347656, "learning_rate": 1.3886772501761043e-05, "loss": 0.9534, "step": 2816 }, { "epoch": 1.172217455736682, "grad_norm": 1.5954521894454956, "learning_rate": 1.388264407172609e-05, "loss": 0.8029, "step": 2817 }, { "epoch": 1.1726334399292826, "grad_norm": 1.5859086513519287, "learning_rate": 1.3878514862321461e-05, "loss": 0.7875, "step": 2818 }, { "epoch": 1.1730494241218834, "grad_norm": 1.55638587474823, "learning_rate": 1.3874384874376021e-05, "loss": 0.8628, "step": 2819 }, { "epoch": 1.173465408314484, "grad_norm": 1.8596141338348389, "learning_rate": 1.3870254108718789e-05, "loss": 0.933, "step": 2820 }, { "epoch": 1.1738813925070848, "grad_norm": 1.6057188510894775, "learning_rate": 1.386612256617894e-05, "loss": 0.8451, "step": 2821 }, { "epoch": 1.1742973766996854, "grad_norm": 1.7613409757614136, "learning_rate": 1.3861990247585807e-05, "loss": 0.951, "step": 2822 }, { "epoch": 1.1747133608922862, "grad_norm": 8.07351303100586, "learning_rate": 1.3857857153768876e-05, "loss": 0.7981, "step": 2823 }, { "epoch": 1.1751293450848868, "grad_norm": 1.694061279296875, "learning_rate": 1.3853723285557794e-05, "loss": 0.7747, "step": 2824 }, { "epoch": 1.1755453292774876, "grad_norm": 1.7075519561767578, "learning_rate": 1.3849588643782353e-05, "loss": 0.8613, "step": 2825 }, { "epoch": 1.1759613134700881, "grad_norm": 1.7919321060180664, "learning_rate": 1.3845453229272508e-05, "loss": 0.944, "step": 2826 }, { "epoch": 1.1763772976626887, "grad_norm": 1.6842796802520752, "learning_rate": 1.3841317042858369e-05, "loss": 0.8096, "step": 2827 }, { "epoch": 1.1767932818552895, "grad_norm": 5.930259704589844, "learning_rate": 1.3837180085370202e-05, "loss": 0.8041, "step": 2828 }, { "epoch": 1.1772092660478901, "grad_norm": 1.63728928565979, "learning_rate": 1.3833042357638423e-05, "loss": 0.8907, "step": 2829 }, { "epoch": 1.177625250240491, "grad_norm": 1.7318155765533447, "learning_rate": 1.3828903860493602e-05, "loss": 0.7839, "step": 2830 }, { "epoch": 1.1780412344330915, "grad_norm": 1.687209963798523, "learning_rate": 1.3824764594766471e-05, "loss": 0.7851, "step": 2831 }, { "epoch": 1.1784572186256923, "grad_norm": 1.6159875392913818, "learning_rate": 1.3820624561287909e-05, "loss": 0.842, "step": 2832 }, { "epoch": 1.1788732028182929, "grad_norm": 1.8385076522827148, "learning_rate": 1.3816483760888959e-05, "loss": 0.937, "step": 2833 }, { "epoch": 1.1792891870108937, "grad_norm": 1.6935325860977173, "learning_rate": 1.38123421944008e-05, "loss": 0.93, "step": 2834 }, { "epoch": 1.1797051712034943, "grad_norm": 1.595998764038086, "learning_rate": 1.3808199862654783e-05, "loss": 0.8625, "step": 2835 }, { "epoch": 1.1801211553960949, "grad_norm": 1.8476402759552002, "learning_rate": 1.3804056766482401e-05, "loss": 0.9732, "step": 2836 }, { "epoch": 1.1805371395886957, "grad_norm": 1.5124266147613525, "learning_rate": 1.3799912906715306e-05, "loss": 0.8246, "step": 2837 }, { "epoch": 1.1809531237812962, "grad_norm": 1.5082249641418457, "learning_rate": 1.3795768284185307e-05, "loss": 0.7341, "step": 2838 }, { "epoch": 1.181369107973897, "grad_norm": 1.646418809890747, "learning_rate": 1.3791622899724355e-05, "loss": 0.8967, "step": 2839 }, { "epoch": 1.1817850921664976, "grad_norm": 1.7181941270828247, "learning_rate": 1.3787476754164559e-05, "loss": 0.889, "step": 2840 }, { "epoch": 1.1822010763590984, "grad_norm": 1.5523617267608643, "learning_rate": 1.3783329848338184e-05, "loss": 0.8591, "step": 2841 }, { "epoch": 1.182617060551699, "grad_norm": 1.5891300439834595, "learning_rate": 1.3779182183077648e-05, "loss": 0.8034, "step": 2842 }, { "epoch": 1.1830330447442998, "grad_norm": 1.600756049156189, "learning_rate": 1.3775033759215517e-05, "loss": 0.9117, "step": 2843 }, { "epoch": 1.1834490289369004, "grad_norm": 1.4745056629180908, "learning_rate": 1.3770884577584511e-05, "loss": 0.891, "step": 2844 }, { "epoch": 1.183865013129501, "grad_norm": 1.772783875465393, "learning_rate": 1.3766734639017501e-05, "loss": 0.9496, "step": 2845 }, { "epoch": 1.1842809973221018, "grad_norm": 1.6462994813919067, "learning_rate": 1.376258394434751e-05, "loss": 0.8964, "step": 2846 }, { "epoch": 1.1846969815147024, "grad_norm": 1.706727385520935, "learning_rate": 1.3758432494407717e-05, "loss": 0.8832, "step": 2847 }, { "epoch": 1.1851129657073032, "grad_norm": 1.6461470127105713, "learning_rate": 1.3754280290031452e-05, "loss": 0.7213, "step": 2848 }, { "epoch": 1.1855289498999038, "grad_norm": 1.741686463356018, "learning_rate": 1.3750127332052185e-05, "loss": 0.841, "step": 2849 }, { "epoch": 1.1859449340925046, "grad_norm": 6.249007701873779, "learning_rate": 1.3745973621303554e-05, "loss": 0.9461, "step": 2850 }, { "epoch": 1.1863609182851051, "grad_norm": 1.8511264324188232, "learning_rate": 1.3741819158619338e-05, "loss": 0.8627, "step": 2851 }, { "epoch": 1.186776902477706, "grad_norm": 1.6933108568191528, "learning_rate": 1.373766394483347e-05, "loss": 0.855, "step": 2852 }, { "epoch": 1.1871928866703065, "grad_norm": 1.6623090505599976, "learning_rate": 1.3733507980780035e-05, "loss": 0.821, "step": 2853 }, { "epoch": 1.1876088708629071, "grad_norm": 1.7148455381393433, "learning_rate": 1.3729351267293261e-05, "loss": 0.8883, "step": 2854 }, { "epoch": 1.188024855055508, "grad_norm": 1.7106667757034302, "learning_rate": 1.3725193805207532e-05, "loss": 0.8825, "step": 2855 }, { "epoch": 1.1884408392481085, "grad_norm": 1.5844672918319702, "learning_rate": 1.372103559535739e-05, "loss": 0.8658, "step": 2856 }, { "epoch": 1.1888568234407093, "grad_norm": 1.5434000492095947, "learning_rate": 1.3716876638577512e-05, "loss": 0.7974, "step": 2857 }, { "epoch": 1.18927280763331, "grad_norm": 1.6961442232131958, "learning_rate": 1.3712716935702738e-05, "loss": 0.8513, "step": 2858 }, { "epoch": 1.1896887918259107, "grad_norm": 1.7704439163208008, "learning_rate": 1.370855648756805e-05, "loss": 0.9126, "step": 2859 }, { "epoch": 1.1901047760185113, "grad_norm": 1.7407279014587402, "learning_rate": 1.3704395295008579e-05, "loss": 0.8582, "step": 2860 }, { "epoch": 1.190520760211112, "grad_norm": 1.625929355621338, "learning_rate": 1.3700233358859608e-05, "loss": 0.7831, "step": 2861 }, { "epoch": 1.1909367444037127, "grad_norm": 1.6496704816818237, "learning_rate": 1.3696070679956575e-05, "loss": 0.8668, "step": 2862 }, { "epoch": 1.1913527285963132, "grad_norm": 1.7029385566711426, "learning_rate": 1.369190725913506e-05, "loss": 0.9059, "step": 2863 }, { "epoch": 1.191768712788914, "grad_norm": 1.7114205360412598, "learning_rate": 1.368774309723079e-05, "loss": 0.7803, "step": 2864 }, { "epoch": 1.1921846969815146, "grad_norm": 1.6262534856796265, "learning_rate": 1.3683578195079643e-05, "loss": 0.8308, "step": 2865 }, { "epoch": 1.1926006811741154, "grad_norm": 1.64303719997406, "learning_rate": 1.3679412553517653e-05, "loss": 0.7936, "step": 2866 }, { "epoch": 1.193016665366716, "grad_norm": 1.4486209154129028, "learning_rate": 1.3675246173380993e-05, "loss": 0.7688, "step": 2867 }, { "epoch": 1.1934326495593168, "grad_norm": 1.5673326253890991, "learning_rate": 1.3671079055505989e-05, "loss": 0.7741, "step": 2868 }, { "epoch": 1.1938486337519174, "grad_norm": 1.5843335390090942, "learning_rate": 1.3666911200729112e-05, "loss": 0.8635, "step": 2869 }, { "epoch": 1.1942646179445182, "grad_norm": 1.639880657196045, "learning_rate": 1.366274260988698e-05, "loss": 0.881, "step": 2870 }, { "epoch": 1.1946806021371188, "grad_norm": 1.7203341722488403, "learning_rate": 1.3658573283816366e-05, "loss": 0.8492, "step": 2871 }, { "epoch": 1.1950965863297194, "grad_norm": 1.7500367164611816, "learning_rate": 1.3654403223354183e-05, "loss": 0.8097, "step": 2872 }, { "epoch": 1.1955125705223202, "grad_norm": 1.6203688383102417, "learning_rate": 1.3650232429337495e-05, "loss": 0.939, "step": 2873 }, { "epoch": 1.1959285547149208, "grad_norm": 1.6009745597839355, "learning_rate": 1.3646060902603513e-05, "loss": 0.7882, "step": 2874 }, { "epoch": 1.1963445389075216, "grad_norm": 1.6429561376571655, "learning_rate": 1.3641888643989593e-05, "loss": 0.8784, "step": 2875 }, { "epoch": 1.1967605231001222, "grad_norm": 3.034383535385132, "learning_rate": 1.363771565433324e-05, "loss": 0.8134, "step": 2876 }, { "epoch": 1.197176507292723, "grad_norm": 1.7598131895065308, "learning_rate": 1.3633541934472107e-05, "loss": 0.9087, "step": 2877 }, { "epoch": 1.1975924914853235, "grad_norm": 1.6827619075775146, "learning_rate": 1.362936748524399e-05, "loss": 0.8242, "step": 2878 }, { "epoch": 1.1980084756779243, "grad_norm": 1.8234126567840576, "learning_rate": 1.3625192307486834e-05, "loss": 0.88, "step": 2879 }, { "epoch": 1.198424459870525, "grad_norm": 1.617067813873291, "learning_rate": 1.3621016402038731e-05, "loss": 0.8558, "step": 2880 }, { "epoch": 1.1988404440631255, "grad_norm": 1.5826221704483032, "learning_rate": 1.3616839769737913e-05, "loss": 0.9022, "step": 2881 }, { "epoch": 1.1992564282557263, "grad_norm": 1.6441030502319336, "learning_rate": 1.3612662411422766e-05, "loss": 0.7804, "step": 2882 }, { "epoch": 1.199672412448327, "grad_norm": 1.528724193572998, "learning_rate": 1.360848432793182e-05, "loss": 0.8528, "step": 2883 }, { "epoch": 1.2000883966409277, "grad_norm": 1.7836673259735107, "learning_rate": 1.3604305520103744e-05, "loss": 0.8801, "step": 2884 }, { "epoch": 1.2005043808335283, "grad_norm": 1.5782488584518433, "learning_rate": 1.3600125988777359e-05, "loss": 0.8213, "step": 2885 }, { "epoch": 1.200920365026129, "grad_norm": 1.6914114952087402, "learning_rate": 1.3595945734791632e-05, "loss": 0.7904, "step": 2886 }, { "epoch": 1.2013363492187297, "grad_norm": 1.626139521598816, "learning_rate": 1.3591764758985665e-05, "loss": 0.7495, "step": 2887 }, { "epoch": 1.2017523334113305, "grad_norm": 1.6206408739089966, "learning_rate": 1.3587583062198724e-05, "loss": 0.7969, "step": 2888 }, { "epoch": 1.202168317603931, "grad_norm": 1.7512259483337402, "learning_rate": 1.3583400645270197e-05, "loss": 0.9491, "step": 2889 }, { "epoch": 1.2025843017965316, "grad_norm": 1.679021954536438, "learning_rate": 1.3579217509039632e-05, "loss": 0.9337, "step": 2890 }, { "epoch": 1.2030002859891324, "grad_norm": 2.666064500808716, "learning_rate": 1.357503365434672e-05, "loss": 0.792, "step": 2891 }, { "epoch": 1.203416270181733, "grad_norm": 1.8489614725112915, "learning_rate": 1.3570849082031285e-05, "loss": 0.8361, "step": 2892 }, { "epoch": 1.2038322543743338, "grad_norm": 45.03181457519531, "learning_rate": 1.3566663792933311e-05, "loss": 0.8857, "step": 2893 }, { "epoch": 1.2042482385669344, "grad_norm": 1.7059372663497925, "learning_rate": 1.3562477787892913e-05, "loss": 0.8477, "step": 2894 }, { "epoch": 1.2046642227595352, "grad_norm": 1.6874780654907227, "learning_rate": 1.355829106775036e-05, "loss": 0.7748, "step": 2895 }, { "epoch": 1.2050802069521358, "grad_norm": 1.6395900249481201, "learning_rate": 1.3554103633346056e-05, "loss": 0.7983, "step": 2896 }, { "epoch": 1.2054961911447366, "grad_norm": 1.753096103668213, "learning_rate": 1.3549915485520556e-05, "loss": 0.8435, "step": 2897 }, { "epoch": 1.2059121753373372, "grad_norm": 1.75657057762146, "learning_rate": 1.3545726625114546e-05, "loss": 0.9258, "step": 2898 }, { "epoch": 1.2063281595299378, "grad_norm": 1.6314340829849243, "learning_rate": 1.3541537052968867e-05, "loss": 0.8452, "step": 2899 }, { "epoch": 1.2067441437225386, "grad_norm": 15.241218566894531, "learning_rate": 1.35373467699245e-05, "loss": 0.8692, "step": 2900 }, { "epoch": 1.2071601279151392, "grad_norm": 1.678993582725525, "learning_rate": 1.3533155776822569e-05, "loss": 0.8752, "step": 2901 }, { "epoch": 1.20757611210774, "grad_norm": 1.6113107204437256, "learning_rate": 1.3528964074504334e-05, "loss": 0.7233, "step": 2902 }, { "epoch": 1.2079920963003405, "grad_norm": 1.5328786373138428, "learning_rate": 1.3524771663811208e-05, "loss": 0.8113, "step": 2903 }, { "epoch": 1.2084080804929413, "grad_norm": 1.5881682634353638, "learning_rate": 1.3520578545584737e-05, "loss": 0.8627, "step": 2904 }, { "epoch": 1.208824064685542, "grad_norm": 1.724822759628296, "learning_rate": 1.3516384720666614e-05, "loss": 0.841, "step": 2905 }, { "epoch": 1.2092400488781427, "grad_norm": 1.6698477268218994, "learning_rate": 1.3512190189898668e-05, "loss": 0.9111, "step": 2906 }, { "epoch": 1.2096560330707433, "grad_norm": 1.7700649499893188, "learning_rate": 1.3507994954122883e-05, "loss": 0.8192, "step": 2907 }, { "epoch": 1.210072017263344, "grad_norm": 1.5044591426849365, "learning_rate": 1.350379901418137e-05, "loss": 0.711, "step": 2908 }, { "epoch": 1.2104880014559447, "grad_norm": 1.7091786861419678, "learning_rate": 1.3499602370916387e-05, "loss": 0.8371, "step": 2909 }, { "epoch": 1.2109039856485453, "grad_norm": 1.5980584621429443, "learning_rate": 1.3495405025170336e-05, "loss": 0.7741, "step": 2910 }, { "epoch": 1.211319969841146, "grad_norm": 101.56485748291016, "learning_rate": 1.3491206977785754e-05, "loss": 0.945, "step": 2911 }, { "epoch": 1.2117359540337467, "grad_norm": 1.5394775867462158, "learning_rate": 1.3487008229605328e-05, "loss": 0.8685, "step": 2912 }, { "epoch": 1.2121519382263475, "grad_norm": 1.7359504699707031, "learning_rate": 1.348280878147187e-05, "loss": 0.8176, "step": 2913 }, { "epoch": 1.212567922418948, "grad_norm": 1.562002420425415, "learning_rate": 1.3478608634228352e-05, "loss": 0.839, "step": 2914 }, { "epoch": 1.2129839066115489, "grad_norm": 1.5918631553649902, "learning_rate": 1.347440778871787e-05, "loss": 0.9387, "step": 2915 }, { "epoch": 1.2133998908041495, "grad_norm": 1.5672729015350342, "learning_rate": 1.347020624578367e-05, "loss": 0.9225, "step": 2916 }, { "epoch": 1.21381587499675, "grad_norm": 1.5457851886749268, "learning_rate": 1.3466004006269135e-05, "loss": 0.8307, "step": 2917 }, { "epoch": 1.2142318591893508, "grad_norm": 1.637861967086792, "learning_rate": 1.3461801071017788e-05, "loss": 0.7971, "step": 2918 }, { "epoch": 1.2146478433819514, "grad_norm": 1.5193549394607544, "learning_rate": 1.345759744087329e-05, "loss": 0.8672, "step": 2919 }, { "epoch": 1.2150638275745522, "grad_norm": 1.7430059909820557, "learning_rate": 1.3453393116679439e-05, "loss": 0.899, "step": 2920 }, { "epoch": 1.2154798117671528, "grad_norm": 1.6959712505340576, "learning_rate": 1.344918809928018e-05, "loss": 0.9771, "step": 2921 }, { "epoch": 1.2158957959597536, "grad_norm": 1.6448646783828735, "learning_rate": 1.3444982389519593e-05, "loss": 0.8271, "step": 2922 }, { "epoch": 1.2163117801523542, "grad_norm": 1.7954659461975098, "learning_rate": 1.3440775988241901e-05, "loss": 0.7688, "step": 2923 }, { "epoch": 1.216727764344955, "grad_norm": 1.7986496686935425, "learning_rate": 1.3436568896291456e-05, "loss": 0.9523, "step": 2924 }, { "epoch": 1.2171437485375556, "grad_norm": 1.8229833841323853, "learning_rate": 1.3432361114512754e-05, "loss": 0.888, "step": 2925 }, { "epoch": 1.2175597327301562, "grad_norm": 1.7677711248397827, "learning_rate": 1.3428152643750439e-05, "loss": 0.7483, "step": 2926 }, { "epoch": 1.217975716922757, "grad_norm": 1.7634488344192505, "learning_rate": 1.3423943484849275e-05, "loss": 0.9634, "step": 2927 }, { "epoch": 1.2183917011153576, "grad_norm": 1.731647253036499, "learning_rate": 1.3419733638654177e-05, "loss": 0.9755, "step": 2928 }, { "epoch": 1.2188076853079584, "grad_norm": 1.5575120449066162, "learning_rate": 1.3415523106010195e-05, "loss": 0.7654, "step": 2929 }, { "epoch": 1.219223669500559, "grad_norm": 1.5264196395874023, "learning_rate": 1.3411311887762515e-05, "loss": 0.7643, "step": 2930 }, { "epoch": 1.2196396536931597, "grad_norm": 1.7276654243469238, "learning_rate": 1.3407099984756462e-05, "loss": 0.9198, "step": 2931 }, { "epoch": 1.2200556378857603, "grad_norm": 1.5850849151611328, "learning_rate": 1.34028873978375e-05, "loss": 0.9397, "step": 2932 }, { "epoch": 1.2204716220783611, "grad_norm": 1.5934752225875854, "learning_rate": 1.339867412785123e-05, "loss": 0.9601, "step": 2933 }, { "epoch": 1.2208876062709617, "grad_norm": 1.7165099382400513, "learning_rate": 1.3394460175643382e-05, "loss": 0.8605, "step": 2934 }, { "epoch": 1.2213035904635623, "grad_norm": 1.5579270124435425, "learning_rate": 1.3390245542059836e-05, "loss": 0.9349, "step": 2935 }, { "epoch": 1.221719574656163, "grad_norm": 1.5646495819091797, "learning_rate": 1.33860302279466e-05, "loss": 0.7751, "step": 2936 }, { "epoch": 1.2221355588487637, "grad_norm": 56.189517974853516, "learning_rate": 1.3381814234149825e-05, "loss": 0.8967, "step": 2937 }, { "epoch": 1.2225515430413645, "grad_norm": 1.6534361839294434, "learning_rate": 1.3377597561515787e-05, "loss": 0.8026, "step": 2938 }, { "epoch": 1.222967527233965, "grad_norm": 1.7945903539657593, "learning_rate": 1.3373380210890912e-05, "loss": 0.8621, "step": 2939 }, { "epoch": 1.2233835114265659, "grad_norm": 1.6299315690994263, "learning_rate": 1.3369162183121754e-05, "loss": 0.8154, "step": 2940 }, { "epoch": 1.2237994956191665, "grad_norm": 1.6534262895584106, "learning_rate": 1.3364943479055003e-05, "loss": 0.8756, "step": 2941 }, { "epoch": 1.2242154798117673, "grad_norm": 1.5339009761810303, "learning_rate": 1.3360724099537492e-05, "loss": 0.8478, "step": 2942 }, { "epoch": 1.2246314640043678, "grad_norm": 1.644388198852539, "learning_rate": 1.3356504045416176e-05, "loss": 0.818, "step": 2943 }, { "epoch": 1.2250474481969684, "grad_norm": 8056.1806640625, "learning_rate": 1.3352283317538162e-05, "loss": 0.8343, "step": 2944 }, { "epoch": 1.2254634323895692, "grad_norm": 1.5388972759246826, "learning_rate": 1.3348061916750676e-05, "loss": 0.7873, "step": 2945 }, { "epoch": 1.2258794165821698, "grad_norm": 1.8096238374710083, "learning_rate": 1.3343839843901095e-05, "loss": 0.8075, "step": 2946 }, { "epoch": 1.2262954007747706, "grad_norm": 1.7053147554397583, "learning_rate": 1.3339617099836917e-05, "loss": 0.9103, "step": 2947 }, { "epoch": 1.2267113849673712, "grad_norm": 1.6315842866897583, "learning_rate": 1.333539368540578e-05, "loss": 0.7784, "step": 2948 }, { "epoch": 1.227127369159972, "grad_norm": 1.834139108657837, "learning_rate": 1.333116960145546e-05, "loss": 0.8323, "step": 2949 }, { "epoch": 1.2275433533525726, "grad_norm": 1.5859681367874146, "learning_rate": 1.3326944848833863e-05, "loss": 0.7497, "step": 2950 }, { "epoch": 1.2279593375451734, "grad_norm": 1.7775541543960571, "learning_rate": 1.3322719428389033e-05, "loss": 0.9501, "step": 2951 }, { "epoch": 1.228375321737774, "grad_norm": 1.7171796560287476, "learning_rate": 1.3318493340969143e-05, "loss": 0.8528, "step": 2952 }, { "epoch": 1.2287913059303746, "grad_norm": 1.7749356031417847, "learning_rate": 1.3314266587422501e-05, "loss": 0.9261, "step": 2953 }, { "epoch": 1.2292072901229754, "grad_norm": 1.6113051176071167, "learning_rate": 1.3310039168597557e-05, "loss": 0.8394, "step": 2954 }, { "epoch": 1.229623274315576, "grad_norm": 1.7764079570770264, "learning_rate": 1.3305811085342882e-05, "loss": 0.8998, "step": 2955 }, { "epoch": 1.2300392585081767, "grad_norm": 1.758673071861267, "learning_rate": 1.330158233850719e-05, "loss": 0.9209, "step": 2956 }, { "epoch": 1.2304552427007773, "grad_norm": 5.361945152282715, "learning_rate": 1.3297352928939322e-05, "loss": 0.9438, "step": 2957 }, { "epoch": 1.2308712268933781, "grad_norm": 1.6354955434799194, "learning_rate": 1.3293122857488254e-05, "loss": 0.8024, "step": 2958 }, { "epoch": 1.2312872110859787, "grad_norm": 225.95013427734375, "learning_rate": 1.3288892125003097e-05, "loss": 0.7905, "step": 2959 }, { "epoch": 1.2317031952785795, "grad_norm": 1.634956955909729, "learning_rate": 1.328466073233309e-05, "loss": 0.7274, "step": 2960 }, { "epoch": 1.23211917947118, "grad_norm": 1.642680287361145, "learning_rate": 1.3280428680327613e-05, "loss": 0.846, "step": 2961 }, { "epoch": 1.2325351636637807, "grad_norm": 1.648070216178894, "learning_rate": 1.327619596983617e-05, "loss": 0.8988, "step": 2962 }, { "epoch": 1.2329511478563815, "grad_norm": 1.625085473060608, "learning_rate": 1.32719626017084e-05, "loss": 0.9243, "step": 2963 }, { "epoch": 1.233367132048982, "grad_norm": 9.616291999816895, "learning_rate": 1.3267728576794072e-05, "loss": 0.8286, "step": 2964 }, { "epoch": 1.2337831162415829, "grad_norm": 1.7192878723144531, "learning_rate": 1.3263493895943093e-05, "loss": 0.7928, "step": 2965 }, { "epoch": 1.2341991004341835, "grad_norm": 1.6676472425460815, "learning_rate": 1.3259258560005495e-05, "loss": 0.8725, "step": 2966 }, { "epoch": 1.2346150846267843, "grad_norm": 1.7483270168304443, "learning_rate": 1.3255022569831449e-05, "loss": 0.8937, "step": 2967 }, { "epoch": 1.2350310688193848, "grad_norm": 1.916174054145813, "learning_rate": 1.325078592627124e-05, "loss": 0.8937, "step": 2968 }, { "epoch": 1.2354470530119857, "grad_norm": 1.6790452003479004, "learning_rate": 1.3246548630175308e-05, "loss": 0.7853, "step": 2969 }, { "epoch": 1.2358630372045862, "grad_norm": 1.684643030166626, "learning_rate": 1.3242310682394212e-05, "loss": 0.8076, "step": 2970 }, { "epoch": 1.2362790213971868, "grad_norm": 1.847639560699463, "learning_rate": 1.323807208377864e-05, "loss": 0.7797, "step": 2971 }, { "epoch": 1.2366950055897876, "grad_norm": 1.554282307624817, "learning_rate": 1.3233832835179412e-05, "loss": 0.8062, "step": 2972 }, { "epoch": 1.2371109897823882, "grad_norm": 1.59995436668396, "learning_rate": 1.322959293744748e-05, "loss": 0.739, "step": 2973 }, { "epoch": 1.237526973974989, "grad_norm": 1.6242272853851318, "learning_rate": 1.3225352391433926e-05, "loss": 0.7754, "step": 2974 }, { "epoch": 1.2379429581675896, "grad_norm": 1.7475696802139282, "learning_rate": 1.322111119798996e-05, "loss": 0.9827, "step": 2975 }, { "epoch": 1.2383589423601904, "grad_norm": 1.683232307434082, "learning_rate": 1.321686935796693e-05, "loss": 0.8526, "step": 2976 }, { "epoch": 1.238774926552791, "grad_norm": 1.7568763494491577, "learning_rate": 1.3212626872216301e-05, "loss": 0.8759, "step": 2977 }, { "epoch": 1.2391909107453918, "grad_norm": 26.32274055480957, "learning_rate": 1.3208383741589677e-05, "loss": 0.9559, "step": 2978 }, { "epoch": 1.2396068949379924, "grad_norm": 2.5255839824676514, "learning_rate": 1.3204139966938788e-05, "loss": 0.9052, "step": 2979 }, { "epoch": 1.240022879130593, "grad_norm": 1.6135581731796265, "learning_rate": 1.3199895549115498e-05, "loss": 0.8108, "step": 2980 }, { "epoch": 1.2404388633231938, "grad_norm": 1.6399261951446533, "learning_rate": 1.319565048897179e-05, "loss": 0.8408, "step": 2981 }, { "epoch": 1.2408548475157943, "grad_norm": 1.6710995435714722, "learning_rate": 1.3191404787359788e-05, "loss": 0.8359, "step": 2982 }, { "epoch": 1.2412708317083951, "grad_norm": 1.811625361442566, "learning_rate": 1.3187158445131731e-05, "loss": 0.9371, "step": 2983 }, { "epoch": 1.2416868159009957, "grad_norm": 2.003596305847168, "learning_rate": 1.318291146314e-05, "loss": 0.8129, "step": 2984 }, { "epoch": 1.2421028000935965, "grad_norm": 1.7279551029205322, "learning_rate": 1.3178663842237101e-05, "loss": 0.7865, "step": 2985 }, { "epoch": 1.242518784286197, "grad_norm": 1.8010424375534058, "learning_rate": 1.3174415583275664e-05, "loss": 0.8754, "step": 2986 }, { "epoch": 1.242934768478798, "grad_norm": 1.651576042175293, "learning_rate": 1.3170166687108443e-05, "loss": 0.8648, "step": 2987 }, { "epoch": 1.2433507526713985, "grad_norm": 1.8686747550964355, "learning_rate": 1.3165917154588334e-05, "loss": 0.9891, "step": 2988 }, { "epoch": 1.243766736863999, "grad_norm": 1.6223989725112915, "learning_rate": 1.316166698656835e-05, "loss": 0.9055, "step": 2989 }, { "epoch": 1.2441827210565999, "grad_norm": 1.7267487049102783, "learning_rate": 1.3157416183901636e-05, "loss": 0.845, "step": 2990 }, { "epoch": 1.2445987052492005, "grad_norm": 1.7807589769363403, "learning_rate": 1.3153164747441462e-05, "loss": 0.8604, "step": 2991 }, { "epoch": 1.2450146894418013, "grad_norm": 1.771832823753357, "learning_rate": 1.3148912678041224e-05, "loss": 0.871, "step": 2992 }, { "epoch": 1.2454306736344019, "grad_norm": 1.6828538179397583, "learning_rate": 1.3144659976554448e-05, "loss": 0.8557, "step": 2993 }, { "epoch": 1.2458466578270027, "grad_norm": 1.626953363418579, "learning_rate": 1.3140406643834785e-05, "loss": 0.7978, "step": 2994 }, { "epoch": 1.2462626420196032, "grad_norm": 231.06838989257812, "learning_rate": 1.3136152680736016e-05, "loss": 0.914, "step": 2995 }, { "epoch": 1.246678626212204, "grad_norm": 1.6303863525390625, "learning_rate": 1.3131898088112045e-05, "loss": 0.874, "step": 2996 }, { "epoch": 1.2470946104048046, "grad_norm": 1.7035410404205322, "learning_rate": 1.3127642866816904e-05, "loss": 0.8126, "step": 2997 }, { "epoch": 1.2475105945974052, "grad_norm": 1.6531884670257568, "learning_rate": 1.312338701770475e-05, "loss": 0.7802, "step": 2998 }, { "epoch": 1.247926578790006, "grad_norm": 1.6221939325332642, "learning_rate": 1.3119130541629863e-05, "loss": 0.8687, "step": 2999 }, { "epoch": 1.2483425629826066, "grad_norm": 1.5726697444915771, "learning_rate": 1.3114873439446663e-05, "loss": 0.9318, "step": 3000 }, { "epoch": 1.2483425629826066, "eval_loss": 0.7876019477844238, "eval_runtime": 1801.539, "eval_samples_per_second": 3.659, "eval_steps_per_second": 1.83, "step": 3000 }, { "epoch": 1.2487585471752074, "grad_norm": 1.765418529510498, "learning_rate": 1.3110615712009675e-05, "loss": 0.8342, "step": 3001 }, { "epoch": 1.249174531367808, "grad_norm": 1.7027384042739868, "learning_rate": 1.3106357360173565e-05, "loss": 0.8686, "step": 3002 }, { "epoch": 1.2495905155604088, "grad_norm": 1.6146013736724854, "learning_rate": 1.3102098384793118e-05, "loss": 0.7989, "step": 3003 }, { "epoch": 1.2500064997530094, "grad_norm": 1.7778451442718506, "learning_rate": 1.3097838786723245e-05, "loss": 0.9008, "step": 3004 }, { "epoch": 1.2504224839456102, "grad_norm": 1.8046547174453735, "learning_rate": 1.309357856681898e-05, "loss": 0.8434, "step": 3005 }, { "epoch": 1.2508384681382108, "grad_norm": 18.91399383544922, "learning_rate": 1.3089317725935491e-05, "loss": 0.7941, "step": 3006 }, { "epoch": 1.2512544523308113, "grad_norm": 2.1271610260009766, "learning_rate": 1.3085056264928058e-05, "loss": 0.9143, "step": 3007 }, { "epoch": 1.2516704365234121, "grad_norm": 1.6269609928131104, "learning_rate": 1.308079418465209e-05, "loss": 0.7698, "step": 3008 }, { "epoch": 1.2520864207160127, "grad_norm": 1.684873104095459, "learning_rate": 1.3076531485963126e-05, "loss": 0.8304, "step": 3009 }, { "epoch": 1.2525024049086135, "grad_norm": 1.7241542339324951, "learning_rate": 1.3072268169716821e-05, "loss": 0.8951, "step": 3010 }, { "epoch": 1.2529183891012141, "grad_norm": 1.5613596439361572, "learning_rate": 1.3068004236768963e-05, "loss": 0.7298, "step": 3011 }, { "epoch": 1.2533343732938147, "grad_norm": 1.6773099899291992, "learning_rate": 1.3063739687975452e-05, "loss": 0.8538, "step": 3012 }, { "epoch": 1.2537503574864155, "grad_norm": 1.5912449359893799, "learning_rate": 1.3059474524192321e-05, "loss": 0.8044, "step": 3013 }, { "epoch": 1.2541663416790163, "grad_norm": 1.711155652999878, "learning_rate": 1.3055208746275722e-05, "loss": 0.8354, "step": 3014 }, { "epoch": 1.2545823258716169, "grad_norm": 1.6631335020065308, "learning_rate": 1.3050942355081936e-05, "loss": 0.889, "step": 3015 }, { "epoch": 1.2549983100642175, "grad_norm": 1.6200828552246094, "learning_rate": 1.304667535146736e-05, "loss": 0.8001, "step": 3016 }, { "epoch": 1.2554142942568183, "grad_norm": 1.6777859926223755, "learning_rate": 1.3042407736288514e-05, "loss": 0.8729, "step": 3017 }, { "epoch": 1.2558302784494189, "grad_norm": 1.578330159187317, "learning_rate": 1.3038139510402049e-05, "loss": 0.8629, "step": 3018 }, { "epoch": 1.2562462626420197, "grad_norm": 1.7547500133514404, "learning_rate": 1.3033870674664728e-05, "loss": 0.8187, "step": 3019 }, { "epoch": 1.2566622468346202, "grad_norm": 1.6658179759979248, "learning_rate": 1.3029601229933445e-05, "loss": 0.8754, "step": 3020 }, { "epoch": 1.2570782310272208, "grad_norm": 1.603269100189209, "learning_rate": 1.3025331177065214e-05, "loss": 0.7645, "step": 3021 }, { "epoch": 1.2574942152198216, "grad_norm": 1.676407814025879, "learning_rate": 1.3021060516917167e-05, "loss": 0.8052, "step": 3022 }, { "epoch": 1.2579101994124224, "grad_norm": 1.6383652687072754, "learning_rate": 1.301678925034656e-05, "loss": 0.9282, "step": 3023 }, { "epoch": 1.258326183605023, "grad_norm": 1.6595486402511597, "learning_rate": 1.3012517378210774e-05, "loss": 0.7898, "step": 3024 }, { "epoch": 1.2587421677976236, "grad_norm": 1.7881600856781006, "learning_rate": 1.3008244901367308e-05, "loss": 0.8602, "step": 3025 }, { "epoch": 1.2591581519902244, "grad_norm": 1.6696876287460327, "learning_rate": 1.3003971820673787e-05, "loss": 0.894, "step": 3026 }, { "epoch": 1.259574136182825, "grad_norm": 1.5479304790496826, "learning_rate": 1.2999698136987946e-05, "loss": 0.7963, "step": 3027 }, { "epoch": 1.2599901203754258, "grad_norm": 1.667256236076355, "learning_rate": 1.2995423851167654e-05, "loss": 0.8394, "step": 3028 }, { "epoch": 1.2604061045680264, "grad_norm": 1.8068958520889282, "learning_rate": 1.2991148964070894e-05, "loss": 0.921, "step": 3029 }, { "epoch": 1.260822088760627, "grad_norm": 1.622130274772644, "learning_rate": 1.2986873476555775e-05, "loss": 0.7628, "step": 3030 }, { "epoch": 1.2612380729532278, "grad_norm": 1.557589054107666, "learning_rate": 1.2982597389480518e-05, "loss": 0.8433, "step": 3031 }, { "epoch": 1.2616540571458286, "grad_norm": 168.91513061523438, "learning_rate": 1.297832070370347e-05, "loss": 0.8224, "step": 3032 }, { "epoch": 1.2620700413384291, "grad_norm": 434.7958068847656, "learning_rate": 1.29740434200831e-05, "loss": 0.7721, "step": 3033 }, { "epoch": 1.2624860255310297, "grad_norm": 17.36846923828125, "learning_rate": 1.296976553947799e-05, "loss": 0.7512, "step": 3034 }, { "epoch": 1.2629020097236305, "grad_norm": 1.9012188911437988, "learning_rate": 1.2965487062746851e-05, "loss": 1.01, "step": 3035 }, { "epoch": 1.2633179939162311, "grad_norm": 1.6986640691757202, "learning_rate": 1.2961207990748505e-05, "loss": 0.8957, "step": 3036 }, { "epoch": 1.263733978108832, "grad_norm": 1.540952205657959, "learning_rate": 1.29569283243419e-05, "loss": 0.7526, "step": 3037 }, { "epoch": 1.2641499623014325, "grad_norm": 1.7327423095703125, "learning_rate": 1.29526480643861e-05, "loss": 0.9085, "step": 3038 }, { "epoch": 1.264565946494033, "grad_norm": 1.6806895732879639, "learning_rate": 1.2948367211740287e-05, "loss": 0.8288, "step": 3039 }, { "epoch": 1.264981930686634, "grad_norm": 219.82257080078125, "learning_rate": 1.2944085767263766e-05, "loss": 0.802, "step": 3040 }, { "epoch": 1.2653979148792347, "grad_norm": 1.6541669368743896, "learning_rate": 1.2939803731815959e-05, "loss": 0.9464, "step": 3041 }, { "epoch": 1.2658138990718353, "grad_norm": 1.7695419788360596, "learning_rate": 1.2935521106256401e-05, "loss": 0.8431, "step": 3042 }, { "epoch": 1.2662298832644359, "grad_norm": 22.9129638671875, "learning_rate": 1.2931237891444757e-05, "loss": 0.9289, "step": 3043 }, { "epoch": 1.2666458674570367, "grad_norm": 3554.871337890625, "learning_rate": 1.2926954088240797e-05, "loss": 0.92, "step": 3044 }, { "epoch": 1.2670618516496372, "grad_norm": 1.6501548290252686, "learning_rate": 1.2922669697504426e-05, "loss": 0.7897, "step": 3045 }, { "epoch": 1.267477835842238, "grad_norm": 1.5963736772537231, "learning_rate": 1.2918384720095649e-05, "loss": 0.8324, "step": 3046 }, { "epoch": 1.2678938200348386, "grad_norm": 1.702226161956787, "learning_rate": 1.2914099156874598e-05, "loss": 0.8031, "step": 3047 }, { "epoch": 1.2683098042274392, "grad_norm": 1.6123780012130737, "learning_rate": 1.2909813008701524e-05, "loss": 0.8023, "step": 3048 }, { "epoch": 1.26872578842004, "grad_norm": 1.6115134954452515, "learning_rate": 1.2905526276436788e-05, "loss": 0.8363, "step": 3049 }, { "epoch": 1.2691417726126408, "grad_norm": 174.75643920898438, "learning_rate": 1.2901238960940883e-05, "loss": 0.9221, "step": 3050 }, { "epoch": 1.2695577568052414, "grad_norm": 574.1459350585938, "learning_rate": 1.2896951063074396e-05, "loss": 0.7622, "step": 3051 }, { "epoch": 1.269973740997842, "grad_norm": 1.7980880737304688, "learning_rate": 1.2892662583698053e-05, "loss": 0.8559, "step": 3052 }, { "epoch": 1.2703897251904428, "grad_norm": 1.9537345170974731, "learning_rate": 1.2888373523672681e-05, "loss": 0.8884, "step": 3053 }, { "epoch": 1.2708057093830434, "grad_norm": 1.8089393377304077, "learning_rate": 1.2884083883859235e-05, "loss": 0.8678, "step": 3054 }, { "epoch": 1.2712216935756442, "grad_norm": 1.7114800214767456, "learning_rate": 1.2879793665118785e-05, "loss": 0.8177, "step": 3055 }, { "epoch": 1.2716376777682448, "grad_norm": 1.76088285446167, "learning_rate": 1.2875502868312506e-05, "loss": 0.7924, "step": 3056 }, { "epoch": 1.2720536619608453, "grad_norm": 98.06652069091797, "learning_rate": 1.28712114943017e-05, "loss": 0.846, "step": 3057 }, { "epoch": 1.2724696461534462, "grad_norm": 1.7598214149475098, "learning_rate": 1.2866919543947783e-05, "loss": 0.7707, "step": 3058 }, { "epoch": 1.272885630346047, "grad_norm": 1.7390919923782349, "learning_rate": 1.2862627018112282e-05, "loss": 0.8952, "step": 3059 }, { "epoch": 1.2733016145386475, "grad_norm": 1.6321442127227783, "learning_rate": 1.2858333917656847e-05, "loss": 0.7919, "step": 3060 }, { "epoch": 1.2737175987312481, "grad_norm": 1.6450945138931274, "learning_rate": 1.2854040243443238e-05, "loss": 0.7384, "step": 3061 }, { "epoch": 1.274133582923849, "grad_norm": 1.677937626838684, "learning_rate": 1.2849745996333329e-05, "loss": 0.8024, "step": 3062 }, { "epoch": 1.2745495671164495, "grad_norm": 1.6751705408096313, "learning_rate": 1.2845451177189115e-05, "loss": 0.8176, "step": 3063 }, { "epoch": 1.2749655513090503, "grad_norm": 1.697293758392334, "learning_rate": 1.2841155786872702e-05, "loss": 0.822, "step": 3064 }, { "epoch": 1.275381535501651, "grad_norm": 1.529319167137146, "learning_rate": 1.2836859826246308e-05, "loss": 0.8109, "step": 3065 }, { "epoch": 1.2757975196942515, "grad_norm": 1.6466013193130493, "learning_rate": 1.2832563296172272e-05, "loss": 0.8035, "step": 3066 }, { "epoch": 1.2762135038868523, "grad_norm": 1.725010871887207, "learning_rate": 1.2828266197513037e-05, "loss": 0.8767, "step": 3067 }, { "epoch": 1.276629488079453, "grad_norm": 1.4919778108596802, "learning_rate": 1.2823968531131175e-05, "loss": 0.8587, "step": 3068 }, { "epoch": 1.2770454722720537, "grad_norm": 1.6925872564315796, "learning_rate": 1.2819670297889361e-05, "loss": 0.8199, "step": 3069 }, { "epoch": 1.2774614564646543, "grad_norm": 1.508147954940796, "learning_rate": 1.2815371498650387e-05, "loss": 0.7163, "step": 3070 }, { "epoch": 1.277877440657255, "grad_norm": 1.8000731468200684, "learning_rate": 1.2811072134277153e-05, "loss": 0.9406, "step": 3071 }, { "epoch": 1.2782934248498556, "grad_norm": 41.22207260131836, "learning_rate": 1.2806772205632685e-05, "loss": 0.8005, "step": 3072 }, { "epoch": 1.2787094090424564, "grad_norm": 1.726135492324829, "learning_rate": 1.2802471713580112e-05, "loss": 0.8338, "step": 3073 }, { "epoch": 1.279125393235057, "grad_norm": 1.7409777641296387, "learning_rate": 1.2798170658982677e-05, "loss": 0.8881, "step": 3074 }, { "epoch": 1.2795413774276576, "grad_norm": 1.6165292263031006, "learning_rate": 1.279386904270374e-05, "loss": 0.863, "step": 3075 }, { "epoch": 1.2799573616202584, "grad_norm": 1.6406785249710083, "learning_rate": 1.2789566865606771e-05, "loss": 0.7241, "step": 3076 }, { "epoch": 1.2803733458128592, "grad_norm": 1.8368966579437256, "learning_rate": 1.2785264128555355e-05, "loss": 0.8109, "step": 3077 }, { "epoch": 1.2807893300054598, "grad_norm": 1.7607641220092773, "learning_rate": 1.2780960832413185e-05, "loss": 0.9295, "step": 3078 }, { "epoch": 1.2812053141980604, "grad_norm": 1.5738754272460938, "learning_rate": 1.277665697804407e-05, "loss": 0.789, "step": 3079 }, { "epoch": 1.2816212983906612, "grad_norm": 2.483452558517456, "learning_rate": 1.2772352566311933e-05, "loss": 0.8275, "step": 3080 }, { "epoch": 1.2820372825832618, "grad_norm": 26.986129760742188, "learning_rate": 1.27680475980808e-05, "loss": 0.8711, "step": 3081 }, { "epoch": 1.2824532667758626, "grad_norm": 1.883450984954834, "learning_rate": 1.2763742074214815e-05, "loss": 0.7536, "step": 3082 }, { "epoch": 1.2828692509684632, "grad_norm": 1.6226739883422852, "learning_rate": 1.275943599557824e-05, "loss": 0.7481, "step": 3083 }, { "epoch": 1.2832852351610637, "grad_norm": 1.7203130722045898, "learning_rate": 1.2755129363035435e-05, "loss": 0.8739, "step": 3084 }, { "epoch": 1.2837012193536645, "grad_norm": 1.6703741550445557, "learning_rate": 1.2750822177450882e-05, "loss": 0.8861, "step": 3085 }, { "epoch": 1.2841172035462654, "grad_norm": 1.6995817422866821, "learning_rate": 1.2746514439689162e-05, "loss": 0.8483, "step": 3086 }, { "epoch": 1.284533187738866, "grad_norm": 1.7897770404815674, "learning_rate": 1.2742206150614982e-05, "loss": 0.9078, "step": 3087 }, { "epoch": 1.2849491719314665, "grad_norm": 1.8579500913619995, "learning_rate": 1.2737897311093149e-05, "loss": 0.8656, "step": 3088 }, { "epoch": 1.2853651561240673, "grad_norm": 1.5313000679016113, "learning_rate": 1.2733587921988584e-05, "loss": 0.7211, "step": 3089 }, { "epoch": 1.285781140316668, "grad_norm": 1.6238982677459717, "learning_rate": 1.2729277984166317e-05, "loss": 0.7773, "step": 3090 }, { "epoch": 1.2861971245092687, "grad_norm": 1.751371145248413, "learning_rate": 1.2724967498491492e-05, "loss": 0.8818, "step": 3091 }, { "epoch": 1.2866131087018693, "grad_norm": 1.6204757690429688, "learning_rate": 1.2720656465829357e-05, "loss": 0.8156, "step": 3092 }, { "epoch": 1.2870290928944699, "grad_norm": 1.6369833946228027, "learning_rate": 1.2716344887045275e-05, "loss": 0.8653, "step": 3093 }, { "epoch": 1.2874450770870707, "grad_norm": 1.6399904489517212, "learning_rate": 1.2712032763004718e-05, "loss": 0.7467, "step": 3094 }, { "epoch": 1.2878610612796715, "grad_norm": 1.553790807723999, "learning_rate": 1.2707720094573261e-05, "loss": 0.7562, "step": 3095 }, { "epoch": 1.288277045472272, "grad_norm": 1.7133619785308838, "learning_rate": 1.2703406882616594e-05, "loss": 0.8435, "step": 3096 }, { "epoch": 1.2886930296648726, "grad_norm": 1.6597098112106323, "learning_rate": 1.269909312800052e-05, "loss": 0.8389, "step": 3097 }, { "epoch": 1.2891090138574735, "grad_norm": 135.83172607421875, "learning_rate": 1.269477883159094e-05, "loss": 0.8236, "step": 3098 }, { "epoch": 1.289524998050074, "grad_norm": 2075.776611328125, "learning_rate": 1.2690463994253874e-05, "loss": 0.8695, "step": 3099 }, { "epoch": 1.2899409822426748, "grad_norm": 1.7698626518249512, "learning_rate": 1.2686148616855447e-05, "loss": 0.9504, "step": 3100 }, { "epoch": 1.2903569664352754, "grad_norm": 1.7286595106124878, "learning_rate": 1.2681832700261889e-05, "loss": 0.9355, "step": 3101 }, { "epoch": 1.290772950627876, "grad_norm": 1.7840816974639893, "learning_rate": 1.2677516245339543e-05, "loss": 1.0013, "step": 3102 }, { "epoch": 1.2911889348204768, "grad_norm": 1.580063819885254, "learning_rate": 1.2673199252954858e-05, "loss": 0.7858, "step": 3103 }, { "epoch": 1.2916049190130776, "grad_norm": 1.641584038734436, "learning_rate": 1.2668881723974391e-05, "loss": 0.8249, "step": 3104 }, { "epoch": 1.2920209032056782, "grad_norm": 1.7292574644088745, "learning_rate": 1.2664563659264807e-05, "loss": 0.9493, "step": 3105 }, { "epoch": 1.2924368873982788, "grad_norm": 1.6926391124725342, "learning_rate": 1.266024505969288e-05, "loss": 0.8647, "step": 3106 }, { "epoch": 1.2928528715908796, "grad_norm": 1.6719151735305786, "learning_rate": 1.2655925926125488e-05, "loss": 0.9562, "step": 3107 }, { "epoch": 1.2932688557834802, "grad_norm": 1.5103909969329834, "learning_rate": 1.2651606259429616e-05, "loss": 0.7698, "step": 3108 }, { "epoch": 1.293684839976081, "grad_norm": 1.6800135374069214, "learning_rate": 1.2647286060472364e-05, "loss": 0.8491, "step": 3109 }, { "epoch": 1.2941008241686816, "grad_norm": 1.7006200551986694, "learning_rate": 1.2642965330120925e-05, "loss": 0.9875, "step": 3110 }, { "epoch": 1.2945168083612821, "grad_norm": 1.7183977365493774, "learning_rate": 1.2638644069242612e-05, "loss": 0.8202, "step": 3111 }, { "epoch": 1.294932792553883, "grad_norm": 1.5958596467971802, "learning_rate": 1.2634322278704836e-05, "loss": 0.8103, "step": 3112 }, { "epoch": 1.2953487767464837, "grad_norm": 1.5596203804016113, "learning_rate": 1.2629999959375118e-05, "loss": 0.883, "step": 3113 }, { "epoch": 1.2957647609390843, "grad_norm": 14.904485702514648, "learning_rate": 1.262567711212109e-05, "loss": 0.8495, "step": 3114 }, { "epoch": 1.296180745131685, "grad_norm": 1.5447850227355957, "learning_rate": 1.2621353737810475e-05, "loss": 0.8109, "step": 3115 }, { "epoch": 1.2965967293242857, "grad_norm": 1.6802268028259277, "learning_rate": 1.2617029837311117e-05, "loss": 0.903, "step": 3116 }, { "epoch": 1.2970127135168863, "grad_norm": 1.7358943223953247, "learning_rate": 1.2612705411490958e-05, "loss": 0.9543, "step": 3117 }, { "epoch": 1.297428697709487, "grad_norm": 1.674422025680542, "learning_rate": 1.2608380461218045e-05, "loss": 0.7302, "step": 3118 }, { "epoch": 1.2978446819020877, "grad_norm": 1.733620047569275, "learning_rate": 1.260405498736054e-05, "loss": 0.8871, "step": 3119 }, { "epoch": 1.2982606660946883, "grad_norm": 1.8037986755371094, "learning_rate": 1.2599728990786697e-05, "loss": 0.7969, "step": 3120 }, { "epoch": 1.298676650287289, "grad_norm": 1.6253690719604492, "learning_rate": 1.2595402472364877e-05, "loss": 0.8498, "step": 3121 }, { "epoch": 1.2990926344798899, "grad_norm": 1.6425808668136597, "learning_rate": 1.2591075432963558e-05, "loss": 0.7907, "step": 3122 }, { "epoch": 1.2995086186724905, "grad_norm": 1.672816276550293, "learning_rate": 1.2586747873451308e-05, "loss": 0.8396, "step": 3123 }, { "epoch": 1.299924602865091, "grad_norm": 1.8516045808792114, "learning_rate": 1.258241979469681e-05, "loss": 0.9344, "step": 3124 }, { "epoch": 1.3003405870576918, "grad_norm": 4.056027889251709, "learning_rate": 1.257809119756884e-05, "loss": 0.8426, "step": 3125 }, { "epoch": 1.3007565712502924, "grad_norm": 1.9025154113769531, "learning_rate": 1.257376208293629e-05, "loss": 0.865, "step": 3126 }, { "epoch": 1.3011725554428932, "grad_norm": 1.5580646991729736, "learning_rate": 1.2569432451668147e-05, "loss": 0.8709, "step": 3127 }, { "epoch": 1.3015885396354938, "grad_norm": 1.6333072185516357, "learning_rate": 1.2565102304633507e-05, "loss": 0.8849, "step": 3128 }, { "epoch": 1.3020045238280944, "grad_norm": 1.7758636474609375, "learning_rate": 1.256077164270157e-05, "loss": 0.9405, "step": 3129 }, { "epoch": 1.3024205080206952, "grad_norm": 1.6150784492492676, "learning_rate": 1.2556440466741632e-05, "loss": 0.708, "step": 3130 }, { "epoch": 1.302836492213296, "grad_norm": 1.6984751224517822, "learning_rate": 1.25521087776231e-05, "loss": 0.9089, "step": 3131 }, { "epoch": 1.3032524764058966, "grad_norm": 1.6678862571716309, "learning_rate": 1.2547776576215482e-05, "loss": 0.7946, "step": 3132 }, { "epoch": 1.3036684605984972, "grad_norm": 1789.314208984375, "learning_rate": 1.2543443863388385e-05, "loss": 0.7336, "step": 3133 }, { "epoch": 1.304084444791098, "grad_norm": 1.6880581378936768, "learning_rate": 1.2539110640011528e-05, "loss": 0.7719, "step": 3134 }, { "epoch": 1.3045004289836986, "grad_norm": 1.783953070640564, "learning_rate": 1.2534776906954718e-05, "loss": 0.8005, "step": 3135 }, { "epoch": 1.3049164131762994, "grad_norm": 1.5423928499221802, "learning_rate": 1.2530442665087874e-05, "loss": 0.8384, "step": 3136 }, { "epoch": 1.3053323973689, "grad_norm": 1.713207483291626, "learning_rate": 1.252610791528102e-05, "loss": 0.7751, "step": 3137 }, { "epoch": 1.3057483815615005, "grad_norm": 1.4718700647354126, "learning_rate": 1.2521772658404281e-05, "loss": 0.8432, "step": 3138 }, { "epoch": 1.3061643657541013, "grad_norm": 1.7852798700332642, "learning_rate": 1.251743689532787e-05, "loss": 0.909, "step": 3139 }, { "epoch": 1.3065803499467021, "grad_norm": 1.6074473857879639, "learning_rate": 1.2513100626922119e-05, "loss": 0.8062, "step": 3140 }, { "epoch": 1.3069963341393027, "grad_norm": 4.3446221351623535, "learning_rate": 1.250876385405745e-05, "loss": 0.8576, "step": 3141 }, { "epoch": 1.3074123183319033, "grad_norm": 1.648451328277588, "learning_rate": 1.2504426577604396e-05, "loss": 0.8405, "step": 3142 }, { "epoch": 1.307828302524504, "grad_norm": 1.5541130304336548, "learning_rate": 1.2500088798433583e-05, "loss": 0.8397, "step": 3143 }, { "epoch": 1.3082442867171047, "grad_norm": 1.6013462543487549, "learning_rate": 1.2495750517415744e-05, "loss": 0.7927, "step": 3144 }, { "epoch": 1.3086602709097055, "grad_norm": 1.605672836303711, "learning_rate": 1.2491411735421702e-05, "loss": 0.7641, "step": 3145 }, { "epoch": 1.309076255102306, "grad_norm": 1.7362979650497437, "learning_rate": 1.248707245332239e-05, "loss": 0.817, "step": 3146 }, { "epoch": 1.3094922392949067, "grad_norm": 1.6731176376342773, "learning_rate": 1.2482732671988846e-05, "loss": 0.9278, "step": 3147 }, { "epoch": 1.3099082234875075, "grad_norm": 1.7207258939743042, "learning_rate": 1.2478392392292195e-05, "loss": 0.9282, "step": 3148 }, { "epoch": 1.3103242076801083, "grad_norm": 1.6517530679702759, "learning_rate": 1.2474051615103675e-05, "loss": 0.8147, "step": 3149 }, { "epoch": 1.3107401918727088, "grad_norm": 1.6785954236984253, "learning_rate": 1.246971034129461e-05, "loss": 0.7835, "step": 3150 }, { "epoch": 1.3111561760653094, "grad_norm": 1.5542947053909302, "learning_rate": 1.2465368571736433e-05, "loss": 0.7488, "step": 3151 }, { "epoch": 1.3115721602579102, "grad_norm": 1.8626911640167236, "learning_rate": 1.246102630730068e-05, "loss": 0.9572, "step": 3152 }, { "epoch": 1.3119881444505108, "grad_norm": 1.7317556142807007, "learning_rate": 1.2456683548858977e-05, "loss": 0.8506, "step": 3153 }, { "epoch": 1.3124041286431116, "grad_norm": 1.6107505559921265, "learning_rate": 1.2452340297283052e-05, "loss": 0.8543, "step": 3154 }, { "epoch": 1.3128201128357122, "grad_norm": 1.5399768352508545, "learning_rate": 1.2447996553444735e-05, "loss": 0.8051, "step": 3155 }, { "epoch": 1.3132360970283128, "grad_norm": 1.6489214897155762, "learning_rate": 1.2443652318215951e-05, "loss": 0.7962, "step": 3156 }, { "epoch": 1.3136520812209136, "grad_norm": 1.5925397872924805, "learning_rate": 1.243930759246873e-05, "loss": 0.8023, "step": 3157 }, { "epoch": 1.3140680654135144, "grad_norm": 1.6537567377090454, "learning_rate": 1.2434962377075195e-05, "loss": 0.8246, "step": 3158 }, { "epoch": 1.314484049606115, "grad_norm": 15.013288497924805, "learning_rate": 1.2430616672907566e-05, "loss": 0.816, "step": 3159 }, { "epoch": 1.3149000337987156, "grad_norm": 1.7393356561660767, "learning_rate": 1.2426270480838162e-05, "loss": 0.815, "step": 3160 }, { "epoch": 1.3153160179913164, "grad_norm": 1.6252987384796143, "learning_rate": 1.2421923801739404e-05, "loss": 0.7757, "step": 3161 }, { "epoch": 1.315732002183917, "grad_norm": 1.7353605031967163, "learning_rate": 1.241757663648381e-05, "loss": 0.8443, "step": 3162 }, { "epoch": 1.3161479863765178, "grad_norm": 1.8222543001174927, "learning_rate": 1.241322898594399e-05, "loss": 0.7728, "step": 3163 }, { "epoch": 1.3165639705691183, "grad_norm": 1.5804499387741089, "learning_rate": 1.2408880850992661e-05, "loss": 0.7943, "step": 3164 }, { "epoch": 1.316979954761719, "grad_norm": 1.5961850881576538, "learning_rate": 1.2404532232502625e-05, "loss": 0.802, "step": 3165 }, { "epoch": 1.3173959389543197, "grad_norm": 1.652761459350586, "learning_rate": 1.2400183131346787e-05, "loss": 0.7295, "step": 3166 }, { "epoch": 1.3178119231469205, "grad_norm": 1.709328055381775, "learning_rate": 1.2395833548398156e-05, "loss": 0.8362, "step": 3167 }, { "epoch": 1.318227907339521, "grad_norm": 1.6391358375549316, "learning_rate": 1.2391483484529827e-05, "loss": 0.713, "step": 3168 }, { "epoch": 1.3186438915321217, "grad_norm": 1.651878833770752, "learning_rate": 1.2387132940614997e-05, "loss": 0.8325, "step": 3169 }, { "epoch": 1.3190598757247225, "grad_norm": 1.7611150741577148, "learning_rate": 1.2382781917526955e-05, "loss": 0.8991, "step": 3170 }, { "epoch": 1.319475859917323, "grad_norm": 1.7665685415267944, "learning_rate": 1.2378430416139091e-05, "loss": 0.7888, "step": 3171 }, { "epoch": 1.3198918441099239, "grad_norm": 1.7140825986862183, "learning_rate": 1.2374078437324892e-05, "loss": 0.8859, "step": 3172 }, { "epoch": 1.3203078283025245, "grad_norm": 1.6234874725341797, "learning_rate": 1.2369725981957936e-05, "loss": 0.8133, "step": 3173 }, { "epoch": 1.320723812495125, "grad_norm": 1.6136939525604248, "learning_rate": 1.2365373050911898e-05, "loss": 0.8499, "step": 3174 }, { "epoch": 1.3211397966877259, "grad_norm": 1.5917271375656128, "learning_rate": 1.2361019645060548e-05, "loss": 0.7756, "step": 3175 }, { "epoch": 1.3215557808803267, "grad_norm": 1.6118073463439941, "learning_rate": 1.2356665765277753e-05, "loss": 0.7751, "step": 3176 }, { "epoch": 1.3219717650729272, "grad_norm": 1.5659191608428955, "learning_rate": 1.2352311412437476e-05, "loss": 0.8854, "step": 3177 }, { "epoch": 1.3223877492655278, "grad_norm": 1.5266833305358887, "learning_rate": 1.2347956587413777e-05, "loss": 0.7528, "step": 3178 }, { "epoch": 1.3228037334581286, "grad_norm": 1.7216219902038574, "learning_rate": 1.2343601291080798e-05, "loss": 0.8206, "step": 3179 }, { "epoch": 1.3232197176507292, "grad_norm": 1.6693487167358398, "learning_rate": 1.2339245524312793e-05, "loss": 0.8434, "step": 3180 }, { "epoch": 1.32363570184333, "grad_norm": 1.6586097478866577, "learning_rate": 1.2334889287984098e-05, "loss": 0.8659, "step": 3181 }, { "epoch": 1.3240516860359306, "grad_norm": 1.6960145235061646, "learning_rate": 1.2330532582969148e-05, "loss": 0.7809, "step": 3182 }, { "epoch": 1.3244676702285312, "grad_norm": 1.6201423406600952, "learning_rate": 1.2326175410142478e-05, "loss": 0.9023, "step": 3183 }, { "epoch": 1.324883654421132, "grad_norm": 88.71733856201172, "learning_rate": 1.2321817770378703e-05, "loss": 0.8461, "step": 3184 }, { "epoch": 1.3252996386137328, "grad_norm": 1.7246116399765015, "learning_rate": 1.2317459664552543e-05, "loss": 0.8321, "step": 3185 }, { "epoch": 1.3257156228063334, "grad_norm": 1.7761610746383667, "learning_rate": 1.2313101093538806e-05, "loss": 0.8184, "step": 3186 }, { "epoch": 1.326131606998934, "grad_norm": 1.6582329273223877, "learning_rate": 1.2308742058212398e-05, "loss": 0.8351, "step": 3187 }, { "epoch": 1.3265475911915348, "grad_norm": 1.7327123880386353, "learning_rate": 1.2304382559448314e-05, "loss": 0.8123, "step": 3188 }, { "epoch": 1.3269635753841353, "grad_norm": 1.72944974899292, "learning_rate": 1.230002259812164e-05, "loss": 0.9017, "step": 3189 }, { "epoch": 1.3273795595767361, "grad_norm": 1.6885980367660522, "learning_rate": 1.2295662175107566e-05, "loss": 0.7088, "step": 3190 }, { "epoch": 1.3277955437693367, "grad_norm": 71.53970336914062, "learning_rate": 1.2291301291281359e-05, "loss": 0.7977, "step": 3191 }, { "epoch": 1.3282115279619373, "grad_norm": 448.24334716796875, "learning_rate": 1.2286939947518393e-05, "loss": 0.8973, "step": 3192 }, { "epoch": 1.3286275121545381, "grad_norm": 1.6080788373947144, "learning_rate": 1.2282578144694127e-05, "loss": 0.8012, "step": 3193 }, { "epoch": 1.329043496347139, "grad_norm": 1.8438868522644043, "learning_rate": 1.2278215883684105e-05, "loss": 0.9226, "step": 3194 }, { "epoch": 1.3294594805397395, "grad_norm": 763.3768310546875, "learning_rate": 1.2273853165363983e-05, "loss": 0.8269, "step": 3195 }, { "epoch": 1.32987546473234, "grad_norm": 1.8094416856765747, "learning_rate": 1.2269489990609487e-05, "loss": 0.8876, "step": 3196 }, { "epoch": 1.330291448924941, "grad_norm": 1.8127282857894897, "learning_rate": 1.2265126360296448e-05, "loss": 0.8718, "step": 3197 }, { "epoch": 1.3307074331175415, "grad_norm": 4.914704322814941, "learning_rate": 1.2260762275300787e-05, "loss": 0.7425, "step": 3198 }, { "epoch": 1.3311234173101423, "grad_norm": 1.7146259546279907, "learning_rate": 1.2256397736498514e-05, "loss": 0.8743, "step": 3199 }, { "epoch": 1.3315394015027429, "grad_norm": 1.681522011756897, "learning_rate": 1.2252032744765725e-05, "loss": 0.8647, "step": 3200 }, { "epoch": 1.3319553856953434, "grad_norm": 1.79880690574646, "learning_rate": 1.2247667300978619e-05, "loss": 0.8012, "step": 3201 }, { "epoch": 1.3323713698879442, "grad_norm": 1.930137038230896, "learning_rate": 1.2243301406013472e-05, "loss": 0.9636, "step": 3202 }, { "epoch": 1.332787354080545, "grad_norm": 1.7099188566207886, "learning_rate": 1.2238935060746667e-05, "loss": 0.8316, "step": 3203 }, { "epoch": 1.3332033382731456, "grad_norm": 1.6591817140579224, "learning_rate": 1.2234568266054657e-05, "loss": 0.8311, "step": 3204 }, { "epoch": 1.3336193224657462, "grad_norm": 1.8415066003799438, "learning_rate": 1.2230201022814002e-05, "loss": 0.8986, "step": 3205 }, { "epoch": 1.334035306658347, "grad_norm": 1.7343860864639282, "learning_rate": 1.2225833331901344e-05, "loss": 0.7943, "step": 3206 }, { "epoch": 1.3344512908509476, "grad_norm": 1.76491117477417, "learning_rate": 1.2221465194193419e-05, "loss": 0.8857, "step": 3207 }, { "epoch": 1.3348672750435484, "grad_norm": 1.7468467950820923, "learning_rate": 1.221709661056705e-05, "loss": 0.8699, "step": 3208 }, { "epoch": 1.335283259236149, "grad_norm": 1.677950382232666, "learning_rate": 1.221272758189915e-05, "loss": 0.6675, "step": 3209 }, { "epoch": 1.3356992434287496, "grad_norm": 3.8404366970062256, "learning_rate": 1.2208358109066719e-05, "loss": 0.8411, "step": 3210 }, { "epoch": 1.3361152276213504, "grad_norm": 1.576583743095398, "learning_rate": 1.2203988192946849e-05, "loss": 0.7587, "step": 3211 }, { "epoch": 1.3365312118139512, "grad_norm": 1.6889424324035645, "learning_rate": 1.2199617834416726e-05, "loss": 0.8054, "step": 3212 }, { "epoch": 1.3369471960065518, "grad_norm": 1.6082408428192139, "learning_rate": 1.2195247034353615e-05, "loss": 0.8167, "step": 3213 }, { "epoch": 1.3373631801991523, "grad_norm": 1.5482686758041382, "learning_rate": 1.2190875793634873e-05, "loss": 0.8486, "step": 3214 }, { "epoch": 1.3377791643917532, "grad_norm": 1.858090877532959, "learning_rate": 1.2186504113137952e-05, "loss": 0.958, "step": 3215 }, { "epoch": 1.3381951485843537, "grad_norm": 1.7683535814285278, "learning_rate": 1.218213199374038e-05, "loss": 0.8097, "step": 3216 }, { "epoch": 1.3386111327769545, "grad_norm": 1.6828280687332153, "learning_rate": 1.2177759436319785e-05, "loss": 0.9018, "step": 3217 }, { "epoch": 1.3390271169695551, "grad_norm": 1.7598462104797363, "learning_rate": 1.2173386441753877e-05, "loss": 0.8893, "step": 3218 }, { "epoch": 1.3394431011621557, "grad_norm": 1.5786265134811401, "learning_rate": 1.2169013010920449e-05, "loss": 0.7959, "step": 3219 }, { "epoch": 1.3398590853547565, "grad_norm": 1.7276381254196167, "learning_rate": 1.2164639144697392e-05, "loss": 0.93, "step": 3220 }, { "epoch": 1.3402750695473573, "grad_norm": 1.711076021194458, "learning_rate": 1.216026484396268e-05, "loss": 0.9072, "step": 3221 }, { "epoch": 1.340691053739958, "grad_norm": 1.7484898567199707, "learning_rate": 1.2155890109594375e-05, "loss": 0.7866, "step": 3222 }, { "epoch": 1.3411070379325585, "grad_norm": 1.6466158628463745, "learning_rate": 1.2151514942470623e-05, "loss": 0.8622, "step": 3223 }, { "epoch": 1.3415230221251593, "grad_norm": 1.5904916524887085, "learning_rate": 1.2147139343469655e-05, "loss": 0.8688, "step": 3224 }, { "epoch": 1.3419390063177599, "grad_norm": 1.7714321613311768, "learning_rate": 1.2142763313469796e-05, "loss": 0.7705, "step": 3225 }, { "epoch": 1.3423549905103607, "grad_norm": 1.6327241659164429, "learning_rate": 1.2138386853349455e-05, "loss": 0.7671, "step": 3226 }, { "epoch": 1.3427709747029613, "grad_norm": 1.8168522119522095, "learning_rate": 1.2134009963987124e-05, "loss": 0.8127, "step": 3227 }, { "epoch": 1.3431869588955618, "grad_norm": 1.6618901491165161, "learning_rate": 1.2129632646261382e-05, "loss": 0.8017, "step": 3228 }, { "epoch": 1.3436029430881626, "grad_norm": 1.6346079111099243, "learning_rate": 1.21252549010509e-05, "loss": 0.8218, "step": 3229 }, { "epoch": 1.3440189272807634, "grad_norm": 1.799862027168274, "learning_rate": 1.2120876729234426e-05, "loss": 0.9256, "step": 3230 }, { "epoch": 1.344434911473364, "grad_norm": 2.9612085819244385, "learning_rate": 1.2116498131690799e-05, "loss": 0.9414, "step": 3231 }, { "epoch": 1.3448508956659646, "grad_norm": 1.6206235885620117, "learning_rate": 1.2112119109298945e-05, "loss": 0.8428, "step": 3232 }, { "epoch": 1.3452668798585654, "grad_norm": 1.5976951122283936, "learning_rate": 1.2107739662937867e-05, "loss": 0.8899, "step": 3233 }, { "epoch": 1.345682864051166, "grad_norm": 1.653639793395996, "learning_rate": 1.2103359793486665e-05, "loss": 0.896, "step": 3234 }, { "epoch": 1.3460988482437668, "grad_norm": 1.811457872390747, "learning_rate": 1.209897950182451e-05, "loss": 0.7565, "step": 3235 }, { "epoch": 1.3465148324363674, "grad_norm": 1.7789820432662964, "learning_rate": 1.2094598788830671e-05, "loss": 0.8319, "step": 3236 }, { "epoch": 1.346930816628968, "grad_norm": 1.6820727586746216, "learning_rate": 1.2090217655384497e-05, "loss": 0.8852, "step": 3237 }, { "epoch": 1.3473468008215688, "grad_norm": 1.7226835489273071, "learning_rate": 1.2085836102365416e-05, "loss": 0.7449, "step": 3238 }, { "epoch": 1.3477627850141696, "grad_norm": 1.721043586730957, "learning_rate": 1.2081454130652945e-05, "loss": 0.8858, "step": 3239 }, { "epoch": 1.3481787692067702, "grad_norm": 1.7208377122879028, "learning_rate": 1.2077071741126687e-05, "loss": 0.8342, "step": 3240 }, { "epoch": 1.3485947533993707, "grad_norm": 1.747453212738037, "learning_rate": 1.2072688934666326e-05, "loss": 0.893, "step": 3241 }, { "epoch": 1.3490107375919715, "grad_norm": 1.790622353553772, "learning_rate": 1.2068305712151627e-05, "loss": 0.8628, "step": 3242 }, { "epoch": 1.3494267217845721, "grad_norm": 1.4810261726379395, "learning_rate": 1.2063922074462444e-05, "loss": 0.85, "step": 3243 }, { "epoch": 1.349842705977173, "grad_norm": 11.390076637268066, "learning_rate": 1.2059538022478714e-05, "loss": 0.8467, "step": 3244 }, { "epoch": 1.3502586901697735, "grad_norm": 1.6219323873519897, "learning_rate": 1.205515355708045e-05, "loss": 0.7575, "step": 3245 }, { "epoch": 1.350674674362374, "grad_norm": 1.684889316558838, "learning_rate": 1.205076867914776e-05, "loss": 0.8147, "step": 3246 }, { "epoch": 1.351090658554975, "grad_norm": 1.8221209049224854, "learning_rate": 1.2046383389560822e-05, "loss": 0.9617, "step": 3247 }, { "epoch": 1.3515066427475757, "grad_norm": 1.7320802211761475, "learning_rate": 1.2041997689199905e-05, "loss": 0.8705, "step": 3248 }, { "epoch": 1.3519226269401763, "grad_norm": 1.651597261428833, "learning_rate": 1.2037611578945359e-05, "loss": 0.7797, "step": 3249 }, { "epoch": 1.3523386111327769, "grad_norm": 1.6409832239151, "learning_rate": 1.2033225059677612e-05, "loss": 0.828, "step": 3250 }, { "epoch": 1.3527545953253777, "grad_norm": 1.685723900794983, "learning_rate": 1.2028838132277178e-05, "loss": 0.8012, "step": 3251 }, { "epoch": 1.3531705795179783, "grad_norm": 1.6543478965759277, "learning_rate": 1.2024450797624658e-05, "loss": 0.7735, "step": 3252 }, { "epoch": 1.353586563710579, "grad_norm": 1.6242531538009644, "learning_rate": 1.2020063056600721e-05, "loss": 0.8917, "step": 3253 }, { "epoch": 1.3540025479031796, "grad_norm": 1.6728644371032715, "learning_rate": 1.201567491008613e-05, "loss": 0.8695, "step": 3254 }, { "epoch": 1.3544185320957802, "grad_norm": 1.6336984634399414, "learning_rate": 1.2011286358961724e-05, "loss": 0.803, "step": 3255 }, { "epoch": 1.354834516288381, "grad_norm": 746924.0625, "learning_rate": 1.2006897404108427e-05, "loss": 0.8265, "step": 3256 }, { "epoch": 1.3552505004809818, "grad_norm": 1.8473355770111084, "learning_rate": 1.2002508046407237e-05, "loss": 0.8761, "step": 3257 }, { "epoch": 1.3556664846735824, "grad_norm": 1.7106256484985352, "learning_rate": 1.1998118286739237e-05, "loss": 0.8845, "step": 3258 }, { "epoch": 1.356082468866183, "grad_norm": 1.6167523860931396, "learning_rate": 1.1993728125985595e-05, "loss": 0.8039, "step": 3259 }, { "epoch": 1.3564984530587838, "grad_norm": 1.5367319583892822, "learning_rate": 1.1989337565027554e-05, "loss": 0.8487, "step": 3260 }, { "epoch": 1.3569144372513844, "grad_norm": 1.7282919883728027, "learning_rate": 1.1984946604746437e-05, "loss": 0.7881, "step": 3261 }, { "epoch": 1.3573304214439852, "grad_norm": 1.6245832443237305, "learning_rate": 1.198055524602365e-05, "loss": 0.854, "step": 3262 }, { "epoch": 1.3577464056365858, "grad_norm": 1.760988712310791, "learning_rate": 1.1976163489740677e-05, "loss": 0.8248, "step": 3263 }, { "epoch": 1.3581623898291864, "grad_norm": 1.7285162210464478, "learning_rate": 1.1971771336779084e-05, "loss": 0.8464, "step": 3264 }, { "epoch": 1.3585783740217872, "grad_norm": 1.6190423965454102, "learning_rate": 1.1967378788020514e-05, "loss": 0.7533, "step": 3265 }, { "epoch": 1.358994358214388, "grad_norm": 1.6729800701141357, "learning_rate": 1.196298584434669e-05, "loss": 0.846, "step": 3266 }, { "epoch": 1.3594103424069885, "grad_norm": 1.5575709342956543, "learning_rate": 1.195859250663942e-05, "loss": 0.8199, "step": 3267 }, { "epoch": 1.3598263265995891, "grad_norm": 1.7124241590499878, "learning_rate": 1.195419877578058e-05, "loss": 0.8592, "step": 3268 }, { "epoch": 1.36024231079219, "grad_norm": 1.668155550956726, "learning_rate": 1.1949804652652133e-05, "loss": 0.8223, "step": 3269 }, { "epoch": 1.3606582949847905, "grad_norm": 2.505537271499634, "learning_rate": 1.1945410138136118e-05, "loss": 0.7268, "step": 3270 }, { "epoch": 1.3610742791773913, "grad_norm": 1.6765656471252441, "learning_rate": 1.1941015233114654e-05, "loss": 0.8141, "step": 3271 }, { "epoch": 1.361490263369992, "grad_norm": 1.8780953884124756, "learning_rate": 1.193661993846994e-05, "loss": 0.7776, "step": 3272 }, { "epoch": 1.3619062475625925, "grad_norm": 1.7969906330108643, "learning_rate": 1.1932224255084246e-05, "loss": 0.8454, "step": 3273 }, { "epoch": 1.3623222317551933, "grad_norm": 1.7274523973464966, "learning_rate": 1.1927828183839929e-05, "loss": 0.844, "step": 3274 }, { "epoch": 1.362738215947794, "grad_norm": 1.7319988012313843, "learning_rate": 1.1923431725619419e-05, "loss": 0.9495, "step": 3275 }, { "epoch": 1.3631542001403947, "grad_norm": 1.6154650449752808, "learning_rate": 1.1919034881305225e-05, "loss": 0.8071, "step": 3276 }, { "epoch": 1.3635701843329953, "grad_norm": 1.7799766063690186, "learning_rate": 1.191463765177993e-05, "loss": 0.8957, "step": 3277 }, { "epoch": 1.363986168525596, "grad_norm": 1.6431139707565308, "learning_rate": 1.1910240037926198e-05, "loss": 0.8451, "step": 3278 }, { "epoch": 1.3644021527181966, "grad_norm": 1.6830350160598755, "learning_rate": 1.1905842040626772e-05, "loss": 0.7635, "step": 3279 }, { "epoch": 1.3648181369107975, "grad_norm": 1.7440484762191772, "learning_rate": 1.1901443660764466e-05, "loss": 0.8397, "step": 3280 }, { "epoch": 1.365234121103398, "grad_norm": 1.5774896144866943, "learning_rate": 1.189704489922218e-05, "loss": 0.8043, "step": 3281 }, { "epoch": 1.3656501052959986, "grad_norm": 1.577197790145874, "learning_rate": 1.1892645756882882e-05, "loss": 0.8188, "step": 3282 }, { "epoch": 1.3660660894885994, "grad_norm": 1.7126047611236572, "learning_rate": 1.1888246234629612e-05, "loss": 0.861, "step": 3283 }, { "epoch": 1.3664820736812002, "grad_norm": 1.77933669090271, "learning_rate": 1.1883846333345504e-05, "loss": 0.8691, "step": 3284 }, { "epoch": 1.3668980578738008, "grad_norm": 1.845171332359314, "learning_rate": 1.1879446053913753e-05, "loss": 0.8906, "step": 3285 }, { "epoch": 1.3673140420664014, "grad_norm": 1.6328331232070923, "learning_rate": 1.1875045397217632e-05, "loss": 0.8008, "step": 3286 }, { "epoch": 1.3677300262590022, "grad_norm": 1.7114956378936768, "learning_rate": 1.1870644364140497e-05, "loss": 0.9144, "step": 3287 }, { "epoch": 1.3681460104516028, "grad_norm": 1.6127088069915771, "learning_rate": 1.1866242955565772e-05, "loss": 0.7755, "step": 3288 }, { "epoch": 1.3685619946442036, "grad_norm": 1.8183211088180542, "learning_rate": 1.186184117237696e-05, "loss": 0.949, "step": 3289 }, { "epoch": 1.3689779788368042, "grad_norm": 1.7281105518341064, "learning_rate": 1.1857439015457639e-05, "loss": 0.8163, "step": 3290 }, { "epoch": 1.3693939630294047, "grad_norm": 1.7079317569732666, "learning_rate": 1.1853036485691462e-05, "loss": 0.8758, "step": 3291 }, { "epoch": 1.3698099472220056, "grad_norm": 1.7812970876693726, "learning_rate": 1.1848633583962153e-05, "loss": 0.9753, "step": 3292 }, { "epoch": 1.3702259314146064, "grad_norm": 1.7927016019821167, "learning_rate": 1.1844230311153516e-05, "loss": 0.9034, "step": 3293 }, { "epoch": 1.370641915607207, "grad_norm": 1.7335281372070312, "learning_rate": 1.1839826668149425e-05, "loss": 0.8508, "step": 3294 }, { "epoch": 1.3710578997998075, "grad_norm": 1.860644817352295, "learning_rate": 1.1835422655833835e-05, "loss": 0.8898, "step": 3295 }, { "epoch": 1.3714738839924083, "grad_norm": 1.6406930685043335, "learning_rate": 1.1831018275090768e-05, "loss": 0.8335, "step": 3296 }, { "epoch": 1.371889868185009, "grad_norm": 1.6571882963180542, "learning_rate": 1.1826613526804325e-05, "loss": 0.728, "step": 3297 }, { "epoch": 1.3723058523776097, "grad_norm": 2.643679141998291, "learning_rate": 1.1822208411858673e-05, "loss": 0.8726, "step": 3298 }, { "epoch": 1.3727218365702103, "grad_norm": 1.6046013832092285, "learning_rate": 1.1817802931138065e-05, "loss": 0.8082, "step": 3299 }, { "epoch": 1.3731378207628109, "grad_norm": 2.5325286388397217, "learning_rate": 1.1813397085526814e-05, "loss": 0.9445, "step": 3300 }, { "epoch": 1.3735538049554117, "grad_norm": 1.7168327569961548, "learning_rate": 1.1808990875909318e-05, "loss": 0.8451, "step": 3301 }, { "epoch": 1.3739697891480125, "grad_norm": 1.7897043228149414, "learning_rate": 1.1804584303170046e-05, "loss": 0.8287, "step": 3302 }, { "epoch": 1.374385773340613, "grad_norm": 1.8544138669967651, "learning_rate": 1.1800177368193526e-05, "loss": 0.8747, "step": 3303 }, { "epoch": 1.3748017575332137, "grad_norm": 1.7850793600082397, "learning_rate": 1.1795770071864374e-05, "loss": 0.9285, "step": 3304 }, { "epoch": 1.3752177417258145, "grad_norm": 1.6382795572280884, "learning_rate": 1.1791362415067277e-05, "loss": 0.7886, "step": 3305 }, { "epoch": 1.375633725918415, "grad_norm": 1.8786903619766235, "learning_rate": 1.1786954398686994e-05, "loss": 0.9152, "step": 3306 }, { "epoch": 1.3760497101110158, "grad_norm": 1.563004732131958, "learning_rate": 1.1782546023608345e-05, "loss": 0.8082, "step": 3307 }, { "epoch": 1.3764656943036164, "grad_norm": 1.6333357095718384, "learning_rate": 1.1778137290716235e-05, "loss": 0.8646, "step": 3308 }, { "epoch": 1.376881678496217, "grad_norm": 1.68541419506073, "learning_rate": 1.1773728200895638e-05, "loss": 0.8548, "step": 3309 }, { "epoch": 1.3772976626888178, "grad_norm": 1.586358666419983, "learning_rate": 1.1769318755031597e-05, "loss": 0.789, "step": 3310 }, { "epoch": 1.3777136468814186, "grad_norm": 9.49636173248291, "learning_rate": 1.1764908954009226e-05, "loss": 0.9337, "step": 3311 }, { "epoch": 1.3781296310740192, "grad_norm": 1.7375460863113403, "learning_rate": 1.176049879871371e-05, "loss": 0.8236, "step": 3312 }, { "epoch": 1.3785456152666198, "grad_norm": 1.627481460571289, "learning_rate": 1.1756088290030312e-05, "loss": 0.8082, "step": 3313 }, { "epoch": 1.3789615994592206, "grad_norm": 1.764959454536438, "learning_rate": 1.1751677428844355e-05, "loss": 0.8301, "step": 3314 }, { "epoch": 1.3793775836518212, "grad_norm": 1.6638269424438477, "learning_rate": 1.1747266216041244e-05, "loss": 0.7985, "step": 3315 }, { "epoch": 1.379793567844422, "grad_norm": 1.668146014213562, "learning_rate": 1.1742854652506449e-05, "loss": 0.8689, "step": 3316 }, { "epoch": 1.3802095520370226, "grad_norm": 1.6397725343704224, "learning_rate": 1.1738442739125504e-05, "loss": 0.7837, "step": 3317 }, { "epoch": 1.3806255362296231, "grad_norm": 229.32080078125, "learning_rate": 1.1734030476784026e-05, "loss": 0.9298, "step": 3318 }, { "epoch": 1.381041520422224, "grad_norm": 1.8013699054718018, "learning_rate": 1.1729617866367692e-05, "loss": 0.844, "step": 3319 }, { "epoch": 1.3814575046148247, "grad_norm": 1.7423588037490845, "learning_rate": 1.1725204908762257e-05, "loss": 0.876, "step": 3320 }, { "epoch": 1.3818734888074253, "grad_norm": 1.7932168245315552, "learning_rate": 1.1720791604853543e-05, "loss": 0.9821, "step": 3321 }, { "epoch": 1.382289473000026, "grad_norm": 1.7218276262283325, "learning_rate": 1.1716377955527431e-05, "loss": 0.7541, "step": 3322 }, { "epoch": 1.3827054571926267, "grad_norm": 1.83295738697052, "learning_rate": 1.1711963961669888e-05, "loss": 0.8545, "step": 3323 }, { "epoch": 1.3831214413852273, "grad_norm": 1.8732388019561768, "learning_rate": 1.1707549624166939e-05, "loss": 0.7884, "step": 3324 }, { "epoch": 1.383537425577828, "grad_norm": 1.7159109115600586, "learning_rate": 1.1703134943904681e-05, "loss": 0.8336, "step": 3325 }, { "epoch": 1.3839534097704287, "grad_norm": 1.9117871522903442, "learning_rate": 1.1698719921769284e-05, "loss": 0.9454, "step": 3326 }, { "epoch": 1.3843693939630293, "grad_norm": 1.6540740728378296, "learning_rate": 1.1694304558646979e-05, "loss": 0.8222, "step": 3327 }, { "epoch": 1.38478537815563, "grad_norm": 1.649117112159729, "learning_rate": 1.168988885542407e-05, "loss": 0.8777, "step": 3328 }, { "epoch": 1.3852013623482309, "grad_norm": 1.766449213027954, "learning_rate": 1.1685472812986926e-05, "loss": 0.8253, "step": 3329 }, { "epoch": 1.3856173465408315, "grad_norm": 1.5935338735580444, "learning_rate": 1.1681056432221994e-05, "loss": 0.9061, "step": 3330 }, { "epoch": 1.386033330733432, "grad_norm": 1.8459984064102173, "learning_rate": 1.1676639714015779e-05, "loss": 0.8581, "step": 3331 }, { "epoch": 1.3864493149260328, "grad_norm": 1.5703468322753906, "learning_rate": 1.167222265925485e-05, "loss": 0.7774, "step": 3332 }, { "epoch": 1.3868652991186334, "grad_norm": 1.7393828630447388, "learning_rate": 1.1667805268825854e-05, "loss": 0.9017, "step": 3333 }, { "epoch": 1.3872812833112342, "grad_norm": 1.73499596118927, "learning_rate": 1.16633875436155e-05, "loss": 0.8088, "step": 3334 }, { "epoch": 1.3876972675038348, "grad_norm": 1.6586687564849854, "learning_rate": 1.1658969484510574e-05, "loss": 0.782, "step": 3335 }, { "epoch": 1.3881132516964354, "grad_norm": 1.6680960655212402, "learning_rate": 1.1654551092397908e-05, "loss": 0.8624, "step": 3336 }, { "epoch": 1.3885292358890362, "grad_norm": 1.8353549242019653, "learning_rate": 1.165013236816442e-05, "loss": 0.8663, "step": 3337 }, { "epoch": 1.388945220081637, "grad_norm": 1.7096447944641113, "learning_rate": 1.1645713312697085e-05, "loss": 0.8485, "step": 3338 }, { "epoch": 1.3893612042742376, "grad_norm": 1.9561278820037842, "learning_rate": 1.164129392688295e-05, "loss": 0.9746, "step": 3339 }, { "epoch": 1.3897771884668382, "grad_norm": 1.7121778726577759, "learning_rate": 1.1636874211609126e-05, "loss": 0.757, "step": 3340 }, { "epoch": 1.390193172659439, "grad_norm": 1.6272286176681519, "learning_rate": 1.1632454167762788e-05, "loss": 0.8712, "step": 3341 }, { "epoch": 1.3906091568520396, "grad_norm": 1.6046693325042725, "learning_rate": 1.162803379623118e-05, "loss": 0.7568, "step": 3342 }, { "epoch": 1.3910251410446404, "grad_norm": 1.8391482830047607, "learning_rate": 1.1623613097901608e-05, "loss": 0.7797, "step": 3343 }, { "epoch": 1.391441125237241, "grad_norm": 1.814605712890625, "learning_rate": 1.161919207366145e-05, "loss": 0.7702, "step": 3344 }, { "epoch": 1.3918571094298415, "grad_norm": 1.6438552141189575, "learning_rate": 1.1614770724398144e-05, "loss": 0.8544, "step": 3345 }, { "epoch": 1.3922730936224423, "grad_norm": 1.7802634239196777, "learning_rate": 1.1610349050999195e-05, "loss": 0.8697, "step": 3346 }, { "epoch": 1.3926890778150431, "grad_norm": 1.7351130247116089, "learning_rate": 1.160592705435217e-05, "loss": 0.8195, "step": 3347 }, { "epoch": 1.3931050620076437, "grad_norm": 1.5693646669387817, "learning_rate": 1.1601504735344709e-05, "loss": 0.8156, "step": 3348 }, { "epoch": 1.3935210462002443, "grad_norm": 1.6064867973327637, "learning_rate": 1.1597082094864507e-05, "loss": 0.7255, "step": 3349 }, { "epoch": 1.3939370303928451, "grad_norm": 1.8421889543533325, "learning_rate": 1.1592659133799333e-05, "loss": 0.7891, "step": 3350 }, { "epoch": 1.3943530145854457, "grad_norm": 1.7015904188156128, "learning_rate": 1.1588235853037011e-05, "loss": 0.8396, "step": 3351 }, { "epoch": 1.3947689987780465, "grad_norm": 1.593672752380371, "learning_rate": 1.1583812253465435e-05, "loss": 0.8507, "step": 3352 }, { "epoch": 1.395184982970647, "grad_norm": 1.751320242881775, "learning_rate": 1.1579388335972561e-05, "loss": 0.7616, "step": 3353 }, { "epoch": 1.3956009671632477, "grad_norm": 1.841142177581787, "learning_rate": 1.157496410144641e-05, "loss": 0.9131, "step": 3354 }, { "epoch": 1.3960169513558485, "grad_norm": 1.6436840295791626, "learning_rate": 1.157053955077507e-05, "loss": 0.7628, "step": 3355 }, { "epoch": 1.3964329355484493, "grad_norm": 1.6278271675109863, "learning_rate": 1.1566114684846681e-05, "loss": 0.8789, "step": 3356 }, { "epoch": 1.3968489197410499, "grad_norm": 1.740466833114624, "learning_rate": 1.1561689504549457e-05, "loss": 0.8331, "step": 3357 }, { "epoch": 1.3972649039336504, "grad_norm": 1.6635980606079102, "learning_rate": 1.1557264010771676e-05, "loss": 0.7627, "step": 3358 }, { "epoch": 1.3976808881262512, "grad_norm": 1.5655796527862549, "learning_rate": 1.1552838204401669e-05, "loss": 0.7752, "step": 3359 }, { "epoch": 1.3980968723188518, "grad_norm": 1.7913703918457031, "learning_rate": 1.1548412086327837e-05, "loss": 0.9472, "step": 3360 }, { "epoch": 1.3985128565114526, "grad_norm": 1.7654287815093994, "learning_rate": 1.1543985657438643e-05, "loss": 0.8038, "step": 3361 }, { "epoch": 1.3989288407040532, "grad_norm": 8.621331214904785, "learning_rate": 1.153955891862261e-05, "loss": 0.8833, "step": 3362 }, { "epoch": 1.3993448248966538, "grad_norm": 1.7887089252471924, "learning_rate": 1.1535131870768327e-05, "loss": 0.8875, "step": 3363 }, { "epoch": 1.3997608090892546, "grad_norm": 1.8083598613739014, "learning_rate": 1.1530704514764443e-05, "loss": 0.9072, "step": 3364 }, { "epoch": 1.4001767932818554, "grad_norm": 1.732338309288025, "learning_rate": 1.1526276851499666e-05, "loss": 0.8498, "step": 3365 }, { "epoch": 1.400592777474456, "grad_norm": 25.441495895385742, "learning_rate": 1.1521848881862771e-05, "loss": 0.7204, "step": 3366 }, { "epoch": 1.4010087616670566, "grad_norm": 1.7298624515533447, "learning_rate": 1.1517420606742591e-05, "loss": 0.9191, "step": 3367 }, { "epoch": 1.4014247458596574, "grad_norm": 1.8142303228378296, "learning_rate": 1.1512992027028018e-05, "loss": 0.7971, "step": 3368 }, { "epoch": 1.401840730052258, "grad_norm": 1.8617173433303833, "learning_rate": 1.1508563143608011e-05, "loss": 0.835, "step": 3369 }, { "epoch": 1.4022567142448588, "grad_norm": 1.6954010725021362, "learning_rate": 1.1504133957371593e-05, "loss": 0.7433, "step": 3370 }, { "epoch": 1.4026726984374593, "grad_norm": 162.55580139160156, "learning_rate": 1.1499704469207832e-05, "loss": 0.9309, "step": 3371 }, { "epoch": 1.40308868263006, "grad_norm": 5.919766426086426, "learning_rate": 1.1495274680005869e-05, "loss": 0.8239, "step": 3372 }, { "epoch": 1.4035046668226607, "grad_norm": 1.6593793630599976, "learning_rate": 1.1490844590654904e-05, "loss": 0.7749, "step": 3373 }, { "epoch": 1.4039206510152615, "grad_norm": 1.6513994932174683, "learning_rate": 1.1486414202044198e-05, "loss": 0.7195, "step": 3374 }, { "epoch": 1.4043366352078621, "grad_norm": 1.599690318107605, "learning_rate": 1.1481983515063073e-05, "loss": 0.8911, "step": 3375 }, { "epoch": 1.4047526194004627, "grad_norm": 1.8188724517822266, "learning_rate": 1.1477552530600902e-05, "loss": 0.9628, "step": 3376 }, { "epoch": 1.4051686035930635, "grad_norm": 1.7412868738174438, "learning_rate": 1.1473121249547127e-05, "loss": 0.8649, "step": 3377 }, { "epoch": 1.405584587785664, "grad_norm": 1.640321969985962, "learning_rate": 1.1468689672791244e-05, "loss": 0.9125, "step": 3378 }, { "epoch": 1.406000571978265, "grad_norm": 1.7524110078811646, "learning_rate": 1.1464257801222816e-05, "loss": 0.9105, "step": 3379 }, { "epoch": 1.4064165561708655, "grad_norm": 1.7377859354019165, "learning_rate": 1.1459825635731454e-05, "loss": 1.0383, "step": 3380 }, { "epoch": 1.406832540363466, "grad_norm": 1.8211041688919067, "learning_rate": 1.145539317720684e-05, "loss": 0.8324, "step": 3381 }, { "epoch": 1.4072485245560669, "grad_norm": 1.9241125583648682, "learning_rate": 1.1450960426538704e-05, "loss": 0.914, "step": 3382 }, { "epoch": 1.4076645087486677, "grad_norm": 9.989706993103027, "learning_rate": 1.1446527384616841e-05, "loss": 0.8783, "step": 3383 }, { "epoch": 1.4080804929412682, "grad_norm": 1.6318340301513672, "learning_rate": 1.1442094052331104e-05, "loss": 0.9056, "step": 3384 }, { "epoch": 1.4084964771338688, "grad_norm": 1.764053463935852, "learning_rate": 1.1437660430571404e-05, "loss": 0.8557, "step": 3385 }, { "epoch": 1.4089124613264696, "grad_norm": 1.8213231563568115, "learning_rate": 1.1433226520227708e-05, "loss": 0.8171, "step": 3386 }, { "epoch": 1.4093284455190702, "grad_norm": 202.2362518310547, "learning_rate": 1.1428792322190038e-05, "loss": 0.7542, "step": 3387 }, { "epoch": 1.409744429711671, "grad_norm": 60.44099807739258, "learning_rate": 1.1424357837348485e-05, "loss": 0.8191, "step": 3388 }, { "epoch": 1.4101604139042716, "grad_norm": 1.7179484367370605, "learning_rate": 1.1419923066593188e-05, "loss": 0.8282, "step": 3389 }, { "epoch": 1.4105763980968722, "grad_norm": 1.9065793752670288, "learning_rate": 1.1415488010814349e-05, "loss": 0.886, "step": 3390 }, { "epoch": 1.410992382289473, "grad_norm": 1.8095626831054688, "learning_rate": 1.1411052670902216e-05, "loss": 0.7949, "step": 3391 }, { "epoch": 1.4114083664820738, "grad_norm": 1.6551623344421387, "learning_rate": 1.140661704774711e-05, "loss": 0.7843, "step": 3392 }, { "epoch": 1.4118243506746744, "grad_norm": 1.6826456785202026, "learning_rate": 1.1402181142239397e-05, "loss": 0.6995, "step": 3393 }, { "epoch": 1.412240334867275, "grad_norm": 1.7236119508743286, "learning_rate": 1.1397744955269505e-05, "loss": 0.9064, "step": 3394 }, { "epoch": 1.4126563190598758, "grad_norm": 1.5656691789627075, "learning_rate": 1.1393308487727919e-05, "loss": 0.7809, "step": 3395 }, { "epoch": 1.4130723032524763, "grad_norm": 5.241281032562256, "learning_rate": 1.1388871740505174e-05, "loss": 0.8398, "step": 3396 }, { "epoch": 1.4134882874450772, "grad_norm": 1.7007302045822144, "learning_rate": 1.138443471449187e-05, "loss": 0.7638, "step": 3397 }, { "epoch": 1.4139042716376777, "grad_norm": 1.775807499885559, "learning_rate": 1.1379997410578658e-05, "loss": 0.8212, "step": 3398 }, { "epoch": 1.4143202558302783, "grad_norm": 1.7385549545288086, "learning_rate": 1.1375559829656244e-05, "loss": 0.9501, "step": 3399 }, { "epoch": 1.4147362400228791, "grad_norm": 44.990966796875, "learning_rate": 1.1371121972615395e-05, "loss": 0.8622, "step": 3400 }, { "epoch": 1.41515222421548, "grad_norm": 1.715621829032898, "learning_rate": 1.1366683840346924e-05, "loss": 0.8053, "step": 3401 }, { "epoch": 1.4155682084080805, "grad_norm": 1.7550182342529297, "learning_rate": 1.1362245433741707e-05, "loss": 0.7764, "step": 3402 }, { "epoch": 1.415984192600681, "grad_norm": 1.7360658645629883, "learning_rate": 1.1357806753690675e-05, "loss": 0.8221, "step": 3403 }, { "epoch": 1.416400176793282, "grad_norm": 1.5454105138778687, "learning_rate": 1.1353367801084807e-05, "loss": 0.7564, "step": 3404 }, { "epoch": 1.4168161609858825, "grad_norm": 1.6532119512557983, "learning_rate": 1.1348928576815148e-05, "loss": 0.6836, "step": 3405 }, { "epoch": 1.4172321451784833, "grad_norm": 1.8004025220870972, "learning_rate": 1.1344489081772785e-05, "loss": 0.9608, "step": 3406 }, { "epoch": 1.4176481293710839, "grad_norm": 1.9961071014404297, "learning_rate": 1.1340049316848866e-05, "loss": 0.9398, "step": 3407 }, { "epoch": 1.4180641135636844, "grad_norm": 1.8515350818634033, "learning_rate": 1.1335609282934597e-05, "loss": 0.9172, "step": 3408 }, { "epoch": 1.4184800977562853, "grad_norm": 3.7547597885131836, "learning_rate": 1.1331168980921228e-05, "loss": 0.8044, "step": 3409 }, { "epoch": 1.418896081948886, "grad_norm": 2.602010488510132, "learning_rate": 1.132672841170007e-05, "loss": 0.9792, "step": 3410 }, { "epoch": 1.4193120661414866, "grad_norm": 1.8126206398010254, "learning_rate": 1.1322287576162488e-05, "loss": 0.9119, "step": 3411 }, { "epoch": 1.4197280503340872, "grad_norm": 91.26793670654297, "learning_rate": 1.1317846475199896e-05, "loss": 0.9457, "step": 3412 }, { "epoch": 1.420144034526688, "grad_norm": 1.6740436553955078, "learning_rate": 1.1313405109703766e-05, "loss": 0.7862, "step": 3413 }, { "epoch": 1.4205600187192886, "grad_norm": 8.230948448181152, "learning_rate": 1.1308963480565622e-05, "loss": 0.8091, "step": 3414 }, { "epoch": 1.4209760029118894, "grad_norm": 1.9696598052978516, "learning_rate": 1.1304521588677033e-05, "loss": 0.7777, "step": 3415 }, { "epoch": 1.42139198710449, "grad_norm": 1.668554425239563, "learning_rate": 1.1300079434929634e-05, "loss": 0.7569, "step": 3416 }, { "epoch": 1.4218079712970906, "grad_norm": 1.7045669555664062, "learning_rate": 1.12956370202151e-05, "loss": 0.7742, "step": 3417 }, { "epoch": 1.4222239554896914, "grad_norm": 1.7626690864562988, "learning_rate": 1.1291194345425172e-05, "loss": 0.8187, "step": 3418 }, { "epoch": 1.4226399396822922, "grad_norm": 1.6631094217300415, "learning_rate": 1.1286751411451629e-05, "loss": 0.7012, "step": 3419 }, { "epoch": 1.4230559238748928, "grad_norm": 3.12646746635437, "learning_rate": 1.1282308219186313e-05, "loss": 0.8596, "step": 3420 }, { "epoch": 1.4234719080674934, "grad_norm": 1.6972756385803223, "learning_rate": 1.127786476952111e-05, "loss": 0.7467, "step": 3421 }, { "epoch": 1.4238878922600942, "grad_norm": 17.442726135253906, "learning_rate": 1.1273421063347963e-05, "loss": 0.9317, "step": 3422 }, { "epoch": 1.4243038764526947, "grad_norm": 1.8870056867599487, "learning_rate": 1.1268977101558865e-05, "loss": 0.8374, "step": 3423 }, { "epoch": 1.4247198606452955, "grad_norm": 4988.423828125, "learning_rate": 1.1264532885045858e-05, "loss": 0.8779, "step": 3424 }, { "epoch": 1.4251358448378961, "grad_norm": 1.7671033143997192, "learning_rate": 1.1260088414701039e-05, "loss": 0.7999, "step": 3425 }, { "epoch": 1.4255518290304967, "grad_norm": 1.8489975929260254, "learning_rate": 1.1255643691416553e-05, "loss": 0.9003, "step": 3426 }, { "epoch": 1.4259678132230975, "grad_norm": 1.9185059070587158, "learning_rate": 1.1251198716084597e-05, "loss": 0.9264, "step": 3427 }, { "epoch": 1.4263837974156983, "grad_norm": 2.4117226600646973, "learning_rate": 1.1246753489597419e-05, "loss": 0.8699, "step": 3428 }, { "epoch": 1.426799781608299, "grad_norm": 5.710943222045898, "learning_rate": 1.1242308012847319e-05, "loss": 0.9089, "step": 3429 }, { "epoch": 1.4272157658008995, "grad_norm": 1.9739148616790771, "learning_rate": 1.1237862286726637e-05, "loss": 0.8021, "step": 3430 }, { "epoch": 1.4276317499935003, "grad_norm": 1.836026906967163, "learning_rate": 1.123341631212778e-05, "loss": 0.8703, "step": 3431 }, { "epoch": 1.4280477341861009, "grad_norm": 3.7951018810272217, "learning_rate": 1.122897008994319e-05, "loss": 0.8759, "step": 3432 }, { "epoch": 1.4284637183787017, "grad_norm": 3.030827522277832, "learning_rate": 1.1224523621065369e-05, "loss": 0.8815, "step": 3433 }, { "epoch": 1.4288797025713023, "grad_norm": 1.7816095352172852, "learning_rate": 1.1220076906386864e-05, "loss": 0.8045, "step": 3434 }, { "epoch": 1.4292956867639028, "grad_norm": 1.6802916526794434, "learning_rate": 1.1215629946800266e-05, "loss": 0.7231, "step": 3435 }, { "epoch": 1.4297116709565036, "grad_norm": 1.5465679168701172, "learning_rate": 1.1211182743198224e-05, "loss": 0.6503, "step": 3436 }, { "epoch": 1.4301276551491044, "grad_norm": 1.8673487901687622, "learning_rate": 1.1206735296473434e-05, "loss": 0.8611, "step": 3437 }, { "epoch": 1.430543639341705, "grad_norm": 1.9069832563400269, "learning_rate": 1.1202287607518639e-05, "loss": 0.9384, "step": 3438 }, { "epoch": 1.4309596235343056, "grad_norm": 1.6545531749725342, "learning_rate": 1.119783967722663e-05, "loss": 0.7697, "step": 3439 }, { "epoch": 1.4313756077269064, "grad_norm": 1.713066816329956, "learning_rate": 1.119339150649025e-05, "loss": 0.8253, "step": 3440 }, { "epoch": 1.431791591919507, "grad_norm": 1.6574537754058838, "learning_rate": 1.118894309620238e-05, "loss": 0.9114, "step": 3441 }, { "epoch": 1.4322075761121078, "grad_norm": 1.639400601387024, "learning_rate": 1.1184494447255968e-05, "loss": 0.8821, "step": 3442 }, { "epoch": 1.4326235603047084, "grad_norm": 1.7463164329528809, "learning_rate": 1.118004556054399e-05, "loss": 0.8059, "step": 3443 }, { "epoch": 1.433039544497309, "grad_norm": 6.072800636291504, "learning_rate": 1.1175596436959489e-05, "loss": 0.9269, "step": 3444 }, { "epoch": 1.4334555286899098, "grad_norm": 1.5211864709854126, "learning_rate": 1.1171147077395532e-05, "loss": 0.8366, "step": 3445 }, { "epoch": 1.4338715128825106, "grad_norm": 1.6775379180908203, "learning_rate": 1.1166697482745255e-05, "loss": 0.8434, "step": 3446 }, { "epoch": 1.4342874970751112, "grad_norm": 1.7221052646636963, "learning_rate": 1.1162247653901827e-05, "loss": 0.8302, "step": 3447 }, { "epoch": 1.4347034812677117, "grad_norm": 1.7105885744094849, "learning_rate": 1.1157797591758474e-05, "loss": 0.7779, "step": 3448 }, { "epoch": 1.4351194654603125, "grad_norm": 1.6744506359100342, "learning_rate": 1.1153347297208468e-05, "loss": 0.8285, "step": 3449 }, { "epoch": 1.4355354496529131, "grad_norm": 2.168480396270752, "learning_rate": 1.1148896771145114e-05, "loss": 0.8755, "step": 3450 }, { "epoch": 1.435951433845514, "grad_norm": 2.803731679916382, "learning_rate": 1.114444601446178e-05, "loss": 0.778, "step": 3451 }, { "epoch": 1.4363674180381145, "grad_norm": 2.2046730518341064, "learning_rate": 1.1139995028051873e-05, "loss": 0.8037, "step": 3452 }, { "epoch": 1.436783402230715, "grad_norm": 1.6951879262924194, "learning_rate": 1.1135543812808847e-05, "loss": 0.8225, "step": 3453 }, { "epoch": 1.437199386423316, "grad_norm": 1.765924096107483, "learning_rate": 1.1131092369626204e-05, "loss": 0.7552, "step": 3454 }, { "epoch": 1.4376153706159167, "grad_norm": 1.5742348432540894, "learning_rate": 1.1126640699397484e-05, "loss": 0.77, "step": 3455 }, { "epoch": 1.4380313548085173, "grad_norm": 1.7460052967071533, "learning_rate": 1.1122188803016281e-05, "loss": 0.8606, "step": 3456 }, { "epoch": 1.4384473390011179, "grad_norm": 1.7972495555877686, "learning_rate": 1.1117736681376234e-05, "loss": 0.8547, "step": 3457 }, { "epoch": 1.4388633231937187, "grad_norm": 33.878997802734375, "learning_rate": 1.1113284335371024e-05, "loss": 0.8428, "step": 3458 }, { "epoch": 1.4392793073863193, "grad_norm": 1.8960691690444946, "learning_rate": 1.1108831765894376e-05, "loss": 0.8649, "step": 3459 }, { "epoch": 1.43969529157892, "grad_norm": 1.7051782608032227, "learning_rate": 1.1104378973840064e-05, "loss": 0.9611, "step": 3460 }, { "epoch": 1.4401112757715206, "grad_norm": 1.8561128377914429, "learning_rate": 1.10999259601019e-05, "loss": 0.8617, "step": 3461 }, { "epoch": 1.4405272599641212, "grad_norm": 1.6923381090164185, "learning_rate": 1.1095472725573751e-05, "loss": 0.801, "step": 3462 }, { "epoch": 1.440943244156722, "grad_norm": 1.647582769393921, "learning_rate": 1.1091019271149519e-05, "loss": 0.8682, "step": 3463 }, { "epoch": 1.4413592283493228, "grad_norm": 1.6379098892211914, "learning_rate": 1.1086565597723156e-05, "loss": 0.779, "step": 3464 }, { "epoch": 1.4417752125419234, "grad_norm": 1.6026173830032349, "learning_rate": 1.108211170618865e-05, "loss": 0.8103, "step": 3465 }, { "epoch": 1.442191196734524, "grad_norm": 1.6953967809677124, "learning_rate": 1.1077657597440042e-05, "loss": 0.8235, "step": 3466 }, { "epoch": 1.4426071809271248, "grad_norm": 25.11026382446289, "learning_rate": 1.107320327237141e-05, "loss": 0.9478, "step": 3467 }, { "epoch": 1.4430231651197254, "grad_norm": 1.6188730001449585, "learning_rate": 1.1068748731876884e-05, "loss": 0.854, "step": 3468 }, { "epoch": 1.4434391493123262, "grad_norm": 1.7503541707992554, "learning_rate": 1.1064293976850629e-05, "loss": 0.8142, "step": 3469 }, { "epoch": 1.4438551335049268, "grad_norm": 1.543212890625, "learning_rate": 1.105983900818685e-05, "loss": 0.8527, "step": 3470 }, { "epoch": 1.4442711176975274, "grad_norm": 1.7991067171096802, "learning_rate": 1.1055383826779803e-05, "loss": 0.866, "step": 3471 }, { "epoch": 1.4446871018901282, "grad_norm": 1.7408037185668945, "learning_rate": 1.1050928433523786e-05, "loss": 0.8733, "step": 3472 }, { "epoch": 1.445103086082729, "grad_norm": 1.6338611841201782, "learning_rate": 1.1046472829313142e-05, "loss": 0.7627, "step": 3473 }, { "epoch": 1.4455190702753296, "grad_norm": 1.5815333127975464, "learning_rate": 1.1042017015042243e-05, "loss": 0.84, "step": 3474 }, { "epoch": 1.4459350544679301, "grad_norm": 1.7050094604492188, "learning_rate": 1.1037560991605518e-05, "loss": 0.792, "step": 3475 }, { "epoch": 1.446351038660531, "grad_norm": 1.8159633874893188, "learning_rate": 1.103310475989743e-05, "loss": 0.8162, "step": 3476 }, { "epoch": 1.4467670228531315, "grad_norm": 1.7291629314422607, "learning_rate": 1.1028648320812485e-05, "loss": 0.8001, "step": 3477 }, { "epoch": 1.4471830070457323, "grad_norm": 1.7167755365371704, "learning_rate": 1.1024191675245232e-05, "loss": 0.8641, "step": 3478 }, { "epoch": 1.447598991238333, "grad_norm": 1.8041133880615234, "learning_rate": 1.1019734824090267e-05, "loss": 0.8049, "step": 3479 }, { "epoch": 1.4480149754309335, "grad_norm": 1.6862925291061401, "learning_rate": 1.101527776824221e-05, "loss": 0.8348, "step": 3480 }, { "epoch": 1.4484309596235343, "grad_norm": 10.334649085998535, "learning_rate": 1.101082050859574e-05, "loss": 0.7428, "step": 3481 }, { "epoch": 1.448846943816135, "grad_norm": 1.778448224067688, "learning_rate": 1.1006363046045569e-05, "loss": 0.8403, "step": 3482 }, { "epoch": 1.4492629280087357, "grad_norm": 1.7790279388427734, "learning_rate": 1.1001905381486452e-05, "loss": 0.8522, "step": 3483 }, { "epoch": 1.4496789122013363, "grad_norm": 1.6846436262130737, "learning_rate": 1.0997447515813184e-05, "loss": 0.8209, "step": 3484 }, { "epoch": 1.450094896393937, "grad_norm": 1.7353360652923584, "learning_rate": 1.09929894499206e-05, "loss": 0.8452, "step": 3485 }, { "epoch": 1.4505108805865377, "grad_norm": 1.742790937423706, "learning_rate": 1.0988531184703569e-05, "loss": 0.9465, "step": 3486 }, { "epoch": 1.4509268647791385, "grad_norm": 1.744418740272522, "learning_rate": 1.098407272105701e-05, "loss": 0.8449, "step": 3487 }, { "epoch": 1.451342848971739, "grad_norm": 1.6515634059906006, "learning_rate": 1.0979614059875886e-05, "loss": 0.8334, "step": 3488 }, { "epoch": 1.4517588331643396, "grad_norm": 1.7479606866836548, "learning_rate": 1.097515520205518e-05, "loss": 0.8125, "step": 3489 }, { "epoch": 1.4521748173569404, "grad_norm": 1.7067687511444092, "learning_rate": 1.097069614848993e-05, "loss": 0.7644, "step": 3490 }, { "epoch": 1.4525908015495412, "grad_norm": 1.6615818738937378, "learning_rate": 1.096623690007521e-05, "loss": 0.7945, "step": 3491 }, { "epoch": 1.4530067857421418, "grad_norm": 1.7557023763656616, "learning_rate": 1.0961777457706131e-05, "loss": 0.8666, "step": 3492 }, { "epoch": 1.4534227699347424, "grad_norm": 1.645548701286316, "learning_rate": 1.0957317822277846e-05, "loss": 0.8162, "step": 3493 }, { "epoch": 1.4538387541273432, "grad_norm": 1.5865542888641357, "learning_rate": 1.0952857994685546e-05, "loss": 0.9179, "step": 3494 }, { "epoch": 1.4542547383199438, "grad_norm": 1030.27392578125, "learning_rate": 1.0948397975824456e-05, "loss": 0.8439, "step": 3495 }, { "epoch": 1.4546707225125446, "grad_norm": 1.6312178373336792, "learning_rate": 1.0943937766589847e-05, "loss": 0.7293, "step": 3496 }, { "epoch": 1.4550867067051452, "grad_norm": 5.529782772064209, "learning_rate": 1.093947736787702e-05, "loss": 0.6974, "step": 3497 }, { "epoch": 1.4555026908977458, "grad_norm": 119.37999725341797, "learning_rate": 1.0935016780581325e-05, "loss": 0.6816, "step": 3498 }, { "epoch": 1.4559186750903466, "grad_norm": 4.413853645324707, "learning_rate": 1.0930556005598139e-05, "loss": 0.8648, "step": 3499 }, { "epoch": 1.4563346592829474, "grad_norm": 1.8470813035964966, "learning_rate": 1.0926095043822877e-05, "loss": 0.8364, "step": 3500 }, { "epoch": 1.4563346592829474, "eval_loss": 0.7789405584335327, "eval_runtime": 2114.2101, "eval_samples_per_second": 3.117, "eval_steps_per_second": 1.559, "step": 3500 }, { "epoch": 1.456750643475548, "grad_norm": 1.569601058959961, "learning_rate": 1.0921633896151e-05, "loss": 0.7337, "step": 3501 }, { "epoch": 1.4571666276681485, "grad_norm": 1.9181694984436035, "learning_rate": 1.0917172563478002e-05, "loss": 0.9072, "step": 3502 }, { "epoch": 1.4575826118607493, "grad_norm": 1.932430624961853, "learning_rate": 1.0912711046699416e-05, "loss": 0.7936, "step": 3503 }, { "epoch": 1.45799859605335, "grad_norm": 1.693447232246399, "learning_rate": 1.0908249346710801e-05, "loss": 0.8496, "step": 3504 }, { "epoch": 1.4584145802459507, "grad_norm": 1.5601826906204224, "learning_rate": 1.0903787464407772e-05, "loss": 0.7359, "step": 3505 }, { "epoch": 1.4588305644385513, "grad_norm": 240.7505340576172, "learning_rate": 1.0899325400685963e-05, "loss": 0.8509, "step": 3506 }, { "epoch": 1.4592465486311519, "grad_norm": 1.743048906326294, "learning_rate": 1.0894863156441053e-05, "loss": 0.7759, "step": 3507 }, { "epoch": 1.4596625328237527, "grad_norm": 1.7622259855270386, "learning_rate": 1.089040073256876e-05, "loss": 0.82, "step": 3508 }, { "epoch": 1.4600785170163535, "grad_norm": 1.694217324256897, "learning_rate": 1.0885938129964826e-05, "loss": 0.9235, "step": 3509 }, { "epoch": 1.460494501208954, "grad_norm": 1.8228908777236938, "learning_rate": 1.0881475349525043e-05, "loss": 0.8662, "step": 3510 }, { "epoch": 1.4609104854015547, "grad_norm": 1.8265889883041382, "learning_rate": 1.0877012392145232e-05, "loss": 0.8239, "step": 3511 }, { "epoch": 1.4613264695941555, "grad_norm": 1.7451411485671997, "learning_rate": 1.0872549258721247e-05, "loss": 0.7882, "step": 3512 }, { "epoch": 1.461742453786756, "grad_norm": 3.639049768447876, "learning_rate": 1.0868085950148984e-05, "loss": 0.7137, "step": 3513 }, { "epoch": 1.4621584379793569, "grad_norm": 1.691217064857483, "learning_rate": 1.0863622467324366e-05, "loss": 0.8878, "step": 3514 }, { "epoch": 1.4625744221719574, "grad_norm": 1.8432989120483398, "learning_rate": 1.085915881114336e-05, "loss": 0.8309, "step": 3515 }, { "epoch": 1.462990406364558, "grad_norm": 1.7744942903518677, "learning_rate": 1.0854694982501959e-05, "loss": 0.8881, "step": 3516 }, { "epoch": 1.4634063905571588, "grad_norm": 2.0928657054901123, "learning_rate": 1.0850230982296195e-05, "loss": 0.9526, "step": 3517 }, { "epoch": 1.4638223747497596, "grad_norm": 1.9231020212173462, "learning_rate": 1.0845766811422138e-05, "loss": 0.8685, "step": 3518 }, { "epoch": 1.4642383589423602, "grad_norm": 1.7733045816421509, "learning_rate": 1.0841302470775887e-05, "loss": 0.8833, "step": 3519 }, { "epoch": 1.4646543431349608, "grad_norm": 10.162572860717773, "learning_rate": 1.0836837961253574e-05, "loss": 0.8819, "step": 3520 }, { "epoch": 1.4650703273275616, "grad_norm": 3.3171322345733643, "learning_rate": 1.0832373283751371e-05, "loss": 0.9191, "step": 3521 }, { "epoch": 1.4654863115201622, "grad_norm": 1.7887202501296997, "learning_rate": 1.082790843916548e-05, "loss": 0.931, "step": 3522 }, { "epoch": 1.465902295712763, "grad_norm": 1.681967854499817, "learning_rate": 1.0823443428392137e-05, "loss": 0.8003, "step": 3523 }, { "epoch": 1.4663182799053636, "grad_norm": 1.7972675561904907, "learning_rate": 1.0818978252327607e-05, "loss": 0.8653, "step": 3524 }, { "epoch": 1.4667342640979641, "grad_norm": 4.624447822570801, "learning_rate": 1.0814512911868196e-05, "loss": 0.8525, "step": 3525 }, { "epoch": 1.467150248290565, "grad_norm": 2.1804752349853516, "learning_rate": 1.0810047407910237e-05, "loss": 0.7772, "step": 3526 }, { "epoch": 1.4675662324831658, "grad_norm": 1.8823878765106201, "learning_rate": 1.08055817413501e-05, "loss": 0.8629, "step": 3527 }, { "epoch": 1.4679822166757663, "grad_norm": 1.7391777038574219, "learning_rate": 1.0801115913084189e-05, "loss": 0.7665, "step": 3528 }, { "epoch": 1.468398200868367, "grad_norm": 1.6442970037460327, "learning_rate": 1.0796649924008928e-05, "loss": 0.844, "step": 3529 }, { "epoch": 1.4688141850609677, "grad_norm": 1.834825038909912, "learning_rate": 1.079218377502079e-05, "loss": 0.8464, "step": 3530 }, { "epoch": 1.4692301692535683, "grad_norm": 1.9987727403640747, "learning_rate": 1.0787717467016272e-05, "loss": 0.8667, "step": 3531 }, { "epoch": 1.4696461534461691, "grad_norm": 1.759956955909729, "learning_rate": 1.0783251000891898e-05, "loss": 0.8329, "step": 3532 }, { "epoch": 1.4700621376387697, "grad_norm": 1.6763108968734741, "learning_rate": 1.0778784377544239e-05, "loss": 0.7606, "step": 3533 }, { "epoch": 1.4704781218313703, "grad_norm": 1.6000123023986816, "learning_rate": 1.0774317597869879e-05, "loss": 0.7327, "step": 3534 }, { "epoch": 1.470894106023971, "grad_norm": 1.7009131908416748, "learning_rate": 1.0769850662765444e-05, "loss": 0.8773, "step": 3535 }, { "epoch": 1.4713100902165719, "grad_norm": 1.6947965621948242, "learning_rate": 1.0765383573127592e-05, "loss": 0.8713, "step": 3536 }, { "epoch": 1.4717260744091725, "grad_norm": 1.6282140016555786, "learning_rate": 1.0760916329853011e-05, "loss": 0.8969, "step": 3537 }, { "epoch": 1.472142058601773, "grad_norm": 1.6891472339630127, "learning_rate": 1.0756448933838412e-05, "loss": 0.8966, "step": 3538 }, { "epoch": 1.4725580427943739, "grad_norm": 1.771083116531372, "learning_rate": 1.0751981385980548e-05, "loss": 0.9135, "step": 3539 }, { "epoch": 1.4729740269869744, "grad_norm": 1.6493885517120361, "learning_rate": 1.0747513687176194e-05, "loss": 0.8088, "step": 3540 }, { "epoch": 1.4733900111795752, "grad_norm": 1.790223240852356, "learning_rate": 1.0743045838322159e-05, "loss": 0.8302, "step": 3541 }, { "epoch": 1.4738059953721758, "grad_norm": 21.666290283203125, "learning_rate": 1.0738577840315288e-05, "loss": 0.6894, "step": 3542 }, { "epoch": 1.4742219795647764, "grad_norm": 1.902103304862976, "learning_rate": 1.0734109694052443e-05, "loss": 0.8591, "step": 3543 }, { "epoch": 1.4746379637573772, "grad_norm": 1.840943694114685, "learning_rate": 1.0729641400430523e-05, "loss": 0.8558, "step": 3544 }, { "epoch": 1.475053947949978, "grad_norm": 1.8054509162902832, "learning_rate": 1.0725172960346459e-05, "loss": 0.802, "step": 3545 }, { "epoch": 1.4754699321425786, "grad_norm": 1.8020094633102417, "learning_rate": 1.0720704374697206e-05, "loss": 0.8973, "step": 3546 }, { "epoch": 1.4758859163351792, "grad_norm": 1.6699117422103882, "learning_rate": 1.071623564437975e-05, "loss": 0.8986, "step": 3547 }, { "epoch": 1.47630190052778, "grad_norm": 1.5202820301055908, "learning_rate": 1.0711766770291112e-05, "loss": 0.791, "step": 3548 }, { "epoch": 1.4767178847203806, "grad_norm": 1.673946738243103, "learning_rate": 1.0707297753328332e-05, "loss": 0.7527, "step": 3549 }, { "epoch": 1.4771338689129814, "grad_norm": 1.7205390930175781, "learning_rate": 1.0702828594388486e-05, "loss": 0.9, "step": 3550 }, { "epoch": 1.477549853105582, "grad_norm": 1.695576786994934, "learning_rate": 1.0698359294368673e-05, "loss": 0.8468, "step": 3551 }, { "epoch": 1.4779658372981825, "grad_norm": 1.76578688621521, "learning_rate": 1.0693889854166024e-05, "loss": 0.9353, "step": 3552 }, { "epoch": 1.4783818214907833, "grad_norm": 1.7158252000808716, "learning_rate": 1.0689420274677697e-05, "loss": 0.8966, "step": 3553 }, { "epoch": 1.4787978056833841, "grad_norm": 1.8520913124084473, "learning_rate": 1.0684950556800879e-05, "loss": 0.9024, "step": 3554 }, { "epoch": 1.4792137898759847, "grad_norm": 1.834018588066101, "learning_rate": 1.0680480701432785e-05, "loss": 0.8024, "step": 3555 }, { "epoch": 1.4796297740685853, "grad_norm": 1.7063974142074585, "learning_rate": 1.0676010709470654e-05, "loss": 0.913, "step": 3556 }, { "epoch": 1.4800457582611861, "grad_norm": 1.5610204935073853, "learning_rate": 1.0671540581811758e-05, "loss": 0.7921, "step": 3557 }, { "epoch": 1.4804617424537867, "grad_norm": 1.7635046243667603, "learning_rate": 1.0667070319353393e-05, "loss": 0.8781, "step": 3558 }, { "epoch": 1.4808777266463875, "grad_norm": 1.6115362644195557, "learning_rate": 1.0662599922992877e-05, "loss": 0.7831, "step": 3559 }, { "epoch": 1.481293710838988, "grad_norm": 1.7454466819763184, "learning_rate": 1.0658129393627566e-05, "loss": 0.8181, "step": 3560 }, { "epoch": 1.4817096950315887, "grad_norm": 1.8176013231277466, "learning_rate": 1.0653658732154835e-05, "loss": 0.7411, "step": 3561 }, { "epoch": 1.4821256792241895, "grad_norm": 1.618687629699707, "learning_rate": 1.0649187939472086e-05, "loss": 0.8162, "step": 3562 }, { "epoch": 1.4825416634167903, "grad_norm": 1.6108893156051636, "learning_rate": 1.0644717016476749e-05, "loss": 0.8874, "step": 3563 }, { "epoch": 1.4829576476093909, "grad_norm": 1.5932682752609253, "learning_rate": 1.0640245964066285e-05, "loss": 0.8063, "step": 3564 }, { "epoch": 1.4833736318019914, "grad_norm": 167.81121826171875, "learning_rate": 1.0635774783138168e-05, "loss": 0.8467, "step": 3565 }, { "epoch": 1.4837896159945922, "grad_norm": 31.148488998413086, "learning_rate": 1.0631303474589912e-05, "loss": 0.9161, "step": 3566 }, { "epoch": 1.4842056001871928, "grad_norm": 1.7917696237564087, "learning_rate": 1.0626832039319048e-05, "loss": 0.8401, "step": 3567 }, { "epoch": 1.4846215843797936, "grad_norm": 1.82732093334198, "learning_rate": 1.0622360478223134e-05, "loss": 0.8872, "step": 3568 }, { "epoch": 1.4850375685723942, "grad_norm": 91.06631469726562, "learning_rate": 1.0617888792199755e-05, "loss": 0.7832, "step": 3569 }, { "epoch": 1.4854535527649948, "grad_norm": 1.7403324842453003, "learning_rate": 1.0613416982146518e-05, "loss": 0.8541, "step": 3570 }, { "epoch": 1.4858695369575956, "grad_norm": 2.023986577987671, "learning_rate": 1.060894504896106e-05, "loss": 0.9004, "step": 3571 }, { "epoch": 1.4862855211501964, "grad_norm": 1.7010281085968018, "learning_rate": 1.060447299354104e-05, "loss": 0.8483, "step": 3572 }, { "epoch": 1.486701505342797, "grad_norm": 1.8647218942642212, "learning_rate": 1.0600000816784135e-05, "loss": 0.9037, "step": 3573 }, { "epoch": 1.4871174895353976, "grad_norm": 59.716854095458984, "learning_rate": 1.059552851958806e-05, "loss": 0.8927, "step": 3574 }, { "epoch": 1.4875334737279984, "grad_norm": 1.6731995344161987, "learning_rate": 1.0591056102850543e-05, "loss": 0.7057, "step": 3575 }, { "epoch": 1.487949457920599, "grad_norm": 13.971428871154785, "learning_rate": 1.058658356746934e-05, "loss": 0.8216, "step": 3576 }, { "epoch": 1.4883654421131998, "grad_norm": 1.7663861513137817, "learning_rate": 1.058211091434223e-05, "loss": 0.9041, "step": 3577 }, { "epoch": 1.4887814263058003, "grad_norm": 1.7239227294921875, "learning_rate": 1.0577638144367017e-05, "loss": 0.8261, "step": 3578 }, { "epoch": 1.489197410498401, "grad_norm": 1.7351329326629639, "learning_rate": 1.057316525844153e-05, "loss": 0.8617, "step": 3579 }, { "epoch": 1.4896133946910017, "grad_norm": 1.5552208423614502, "learning_rate": 1.0568692257463615e-05, "loss": 0.8174, "step": 3580 }, { "epoch": 1.4900293788836025, "grad_norm": 1.6534888744354248, "learning_rate": 1.0564219142331149e-05, "loss": 1.0025, "step": 3581 }, { "epoch": 1.4904453630762031, "grad_norm": 1.6968886852264404, "learning_rate": 1.0559745913942026e-05, "loss": 0.8131, "step": 3582 }, { "epoch": 1.4908613472688037, "grad_norm": 1.5900698900222778, "learning_rate": 1.0555272573194162e-05, "loss": 0.8003, "step": 3583 }, { "epoch": 1.4912773314614045, "grad_norm": 1.7244645357131958, "learning_rate": 1.05507991209855e-05, "loss": 0.6988, "step": 3584 }, { "epoch": 1.491693315654005, "grad_norm": 1.6462956666946411, "learning_rate": 1.0546325558214005e-05, "loss": 0.8017, "step": 3585 }, { "epoch": 1.492109299846606, "grad_norm": 1.8275293111801147, "learning_rate": 1.0541851885777663e-05, "loss": 0.7996, "step": 3586 }, { "epoch": 1.4925252840392065, "grad_norm": 1.767782211303711, "learning_rate": 1.0537378104574481e-05, "loss": 0.9618, "step": 3587 }, { "epoch": 1.492941268231807, "grad_norm": 1.7646862268447876, "learning_rate": 1.0532904215502486e-05, "loss": 0.8205, "step": 3588 }, { "epoch": 1.4933572524244079, "grad_norm": 1.9203267097473145, "learning_rate": 1.0528430219459733e-05, "loss": 0.8875, "step": 3589 }, { "epoch": 1.4937732366170087, "grad_norm": 17.82093620300293, "learning_rate": 1.0523956117344295e-05, "loss": 0.7892, "step": 3590 }, { "epoch": 1.4941892208096093, "grad_norm": 1.7590327262878418, "learning_rate": 1.0519481910054261e-05, "loss": 0.9328, "step": 3591 }, { "epoch": 1.4946052050022098, "grad_norm": 1.7260457277297974, "learning_rate": 1.0515007598487752e-05, "loss": 0.8973, "step": 3592 }, { "epoch": 1.4950211891948106, "grad_norm": 3.891019582748413, "learning_rate": 1.05105331835429e-05, "loss": 0.7417, "step": 3593 }, { "epoch": 1.4954371733874112, "grad_norm": 231.79318237304688, "learning_rate": 1.0506058666117863e-05, "loss": 0.8304, "step": 3594 }, { "epoch": 1.495853157580012, "grad_norm": 1.8152151107788086, "learning_rate": 1.0501584047110818e-05, "loss": 0.8738, "step": 3595 }, { "epoch": 1.4962691417726126, "grad_norm": 1.6715805530548096, "learning_rate": 1.0497109327419967e-05, "loss": 0.8689, "step": 3596 }, { "epoch": 1.4966851259652132, "grad_norm": 1.7627029418945312, "learning_rate": 1.0492634507943524e-05, "loss": 0.8724, "step": 3597 }, { "epoch": 1.497101110157814, "grad_norm": 1.7848281860351562, "learning_rate": 1.0488159589579728e-05, "loss": 0.8964, "step": 3598 }, { "epoch": 1.4975170943504148, "grad_norm": 1.791237235069275, "learning_rate": 1.0483684573226834e-05, "loss": 0.9884, "step": 3599 }, { "epoch": 1.4979330785430154, "grad_norm": 1.7235441207885742, "learning_rate": 1.0479209459783124e-05, "loss": 0.8409, "step": 3600 }, { "epoch": 1.498349062735616, "grad_norm": 4.029375076293945, "learning_rate": 1.0474734250146893e-05, "loss": 0.764, "step": 3601 }, { "epoch": 1.4987650469282168, "grad_norm": 1.6525940895080566, "learning_rate": 1.0470258945216458e-05, "loss": 0.8278, "step": 3602 }, { "epoch": 1.4991810311208174, "grad_norm": 1.6134648323059082, "learning_rate": 1.0465783545890153e-05, "loss": 0.8349, "step": 3603 }, { "epoch": 1.4995970153134182, "grad_norm": 1.5955085754394531, "learning_rate": 1.0461308053066334e-05, "loss": 0.8607, "step": 3604 }, { "epoch": 1.5000129995060187, "grad_norm": 1.6738866567611694, "learning_rate": 1.0456832467643374e-05, "loss": 1.0341, "step": 3605 }, { "epoch": 1.5004289836986193, "grad_norm": 1.899356722831726, "learning_rate": 1.0452356790519664e-05, "loss": 0.8212, "step": 3606 }, { "epoch": 1.5008449678912201, "grad_norm": 2.0242998600006104, "learning_rate": 1.0447881022593616e-05, "loss": 0.775, "step": 3607 }, { "epoch": 1.501260952083821, "grad_norm": 1.629376769065857, "learning_rate": 1.0443405164763654e-05, "loss": 0.8597, "step": 3608 }, { "epoch": 1.5016769362764215, "grad_norm": 1.7988930940628052, "learning_rate": 1.0438929217928228e-05, "loss": 0.8257, "step": 3609 }, { "epoch": 1.502092920469022, "grad_norm": 12.661429405212402, "learning_rate": 1.0434453182985804e-05, "loss": 0.8333, "step": 3610 }, { "epoch": 1.502508904661623, "grad_norm": 1.8745882511138916, "learning_rate": 1.0429977060834864e-05, "loss": 0.9205, "step": 3611 }, { "epoch": 1.5029248888542235, "grad_norm": 1.5994455814361572, "learning_rate": 1.0425500852373902e-05, "loss": 0.8048, "step": 3612 }, { "epoch": 1.5033408730468243, "grad_norm": 1.8690568208694458, "learning_rate": 1.0421024558501438e-05, "loss": 0.9185, "step": 3613 }, { "epoch": 1.5037568572394249, "grad_norm": 1.8725253343582153, "learning_rate": 1.0416548180116005e-05, "loss": 0.8677, "step": 3614 }, { "epoch": 1.5041728414320255, "grad_norm": 1.6436455249786377, "learning_rate": 1.041207171811616e-05, "loss": 0.7812, "step": 3615 }, { "epoch": 1.5045888256246263, "grad_norm": 1.7661467790603638, "learning_rate": 1.0407595173400465e-05, "loss": 0.8657, "step": 3616 }, { "epoch": 1.505004809817227, "grad_norm": 1.8211978673934937, "learning_rate": 1.0403118546867503e-05, "loss": 0.8327, "step": 3617 }, { "epoch": 1.5054207940098276, "grad_norm": 1.6299971342086792, "learning_rate": 1.0398641839415878e-05, "loss": 0.7391, "step": 3618 }, { "epoch": 1.5058367782024282, "grad_norm": 1.6002631187438965, "learning_rate": 1.0394165051944208e-05, "loss": 0.8015, "step": 3619 }, { "epoch": 1.506252762395029, "grad_norm": 1.5945055484771729, "learning_rate": 1.0389688185351121e-05, "loss": 0.7745, "step": 3620 }, { "epoch": 1.5066687465876296, "grad_norm": 1.5866930484771729, "learning_rate": 1.0385211240535268e-05, "loss": 0.9342, "step": 3621 }, { "epoch": 1.5070847307802304, "grad_norm": 1.834367036819458, "learning_rate": 1.0380734218395321e-05, "loss": 0.773, "step": 3622 }, { "epoch": 1.507500714972831, "grad_norm": 1.911558747291565, "learning_rate": 1.0376257119829951e-05, "loss": 0.7337, "step": 3623 }, { "epoch": 1.5079166991654316, "grad_norm": 1.6473078727722168, "learning_rate": 1.0371779945737853e-05, "loss": 0.6985, "step": 3624 }, { "epoch": 1.5083326833580324, "grad_norm": 1.7924119234085083, "learning_rate": 1.0367302697017743e-05, "loss": 0.8512, "step": 3625 }, { "epoch": 1.5087486675506332, "grad_norm": 1.82048761844635, "learning_rate": 1.0362825374568349e-05, "loss": 0.8107, "step": 3626 }, { "epoch": 1.5091646517432338, "grad_norm": 5.129420757293701, "learning_rate": 1.0358347979288403e-05, "loss": 0.7912, "step": 3627 }, { "epoch": 1.5095806359358344, "grad_norm": 146.59544372558594, "learning_rate": 1.0353870512076667e-05, "loss": 0.8112, "step": 3628 }, { "epoch": 1.5099966201284352, "grad_norm": 1.7755521535873413, "learning_rate": 1.0349392973831907e-05, "loss": 0.8085, "step": 3629 }, { "epoch": 1.5104126043210357, "grad_norm": 1.767286777496338, "learning_rate": 1.0344915365452905e-05, "loss": 0.8917, "step": 3630 }, { "epoch": 1.5108285885136366, "grad_norm": 1.8003662824630737, "learning_rate": 1.0340437687838468e-05, "loss": 0.8414, "step": 3631 }, { "epoch": 1.5112445727062371, "grad_norm": 1.8181872367858887, "learning_rate": 1.0335959941887397e-05, "loss": 0.7864, "step": 3632 }, { "epoch": 1.5116605568988377, "grad_norm": 1.8079688549041748, "learning_rate": 1.0331482128498522e-05, "loss": 0.8177, "step": 3633 }, { "epoch": 1.5120765410914385, "grad_norm": 1.875556468963623, "learning_rate": 1.0327004248570682e-05, "loss": 0.8441, "step": 3634 }, { "epoch": 1.5124925252840393, "grad_norm": 1.5581049919128418, "learning_rate": 1.032252630300273e-05, "loss": 0.7455, "step": 3635 }, { "epoch": 1.51290850947664, "grad_norm": 1.686616063117981, "learning_rate": 1.0318048292693532e-05, "loss": 0.8446, "step": 3636 }, { "epoch": 1.5133244936692405, "grad_norm": 1.734534502029419, "learning_rate": 1.0313570218541965e-05, "loss": 0.7901, "step": 3637 }, { "epoch": 1.5137404778618413, "grad_norm": 1.727272868156433, "learning_rate": 1.0309092081446916e-05, "loss": 0.917, "step": 3638 }, { "epoch": 1.5141564620544419, "grad_norm": 1.7666157484054565, "learning_rate": 1.0304613882307296e-05, "loss": 0.7056, "step": 3639 }, { "epoch": 1.5145724462470427, "grad_norm": 1.7901928424835205, "learning_rate": 1.0300135622022018e-05, "loss": 0.8555, "step": 3640 }, { "epoch": 1.5149884304396433, "grad_norm": 1.7109495401382446, "learning_rate": 1.0295657301490017e-05, "loss": 0.9096, "step": 3641 }, { "epoch": 1.5154044146322438, "grad_norm": 1.6514873504638672, "learning_rate": 1.0291178921610226e-05, "loss": 0.8449, "step": 3642 }, { "epoch": 1.5158203988248447, "grad_norm": 26.979022979736328, "learning_rate": 1.02867004832816e-05, "loss": 0.9302, "step": 3643 }, { "epoch": 1.5162363830174455, "grad_norm": 1.7903255224227905, "learning_rate": 1.0282221987403102e-05, "loss": 0.7682, "step": 3644 }, { "epoch": 1.516652367210046, "grad_norm": 1.7144821882247925, "learning_rate": 1.0277743434873712e-05, "loss": 0.8075, "step": 3645 }, { "epoch": 1.5170683514026466, "grad_norm": 1.7183771133422852, "learning_rate": 1.0273264826592418e-05, "loss": 0.7755, "step": 3646 }, { "epoch": 1.5174843355952474, "grad_norm": 3.4081637859344482, "learning_rate": 1.0268786163458212e-05, "loss": 0.91, "step": 3647 }, { "epoch": 1.517900319787848, "grad_norm": 8.289155006408691, "learning_rate": 1.0264307446370105e-05, "loss": 0.8064, "step": 3648 }, { "epoch": 1.5183163039804488, "grad_norm": 1.7059063911437988, "learning_rate": 1.0259828676227123e-05, "loss": 0.8162, "step": 3649 }, { "epoch": 1.5187322881730494, "grad_norm": 2.9266583919525146, "learning_rate": 1.0255349853928294e-05, "loss": 0.8196, "step": 3650 }, { "epoch": 1.51914827236565, "grad_norm": 1.7255988121032715, "learning_rate": 1.025087098037266e-05, "loss": 0.8157, "step": 3651 }, { "epoch": 1.5195642565582508, "grad_norm": 1.5610827207565308, "learning_rate": 1.0246392056459271e-05, "loss": 0.8057, "step": 3652 }, { "epoch": 1.5199802407508516, "grad_norm": 32.51876449584961, "learning_rate": 1.0241913083087191e-05, "loss": 0.8521, "step": 3653 }, { "epoch": 1.5203962249434522, "grad_norm": 1.718944787979126, "learning_rate": 1.0237434061155487e-05, "loss": 0.8791, "step": 3654 }, { "epoch": 1.5208122091360528, "grad_norm": 1.8340412378311157, "learning_rate": 1.023295499156325e-05, "loss": 0.8473, "step": 3655 }, { "epoch": 1.5212281933286536, "grad_norm": 1.8651360273361206, "learning_rate": 1.0228475875209563e-05, "loss": 0.8704, "step": 3656 }, { "epoch": 1.5216441775212541, "grad_norm": 1.8081128597259521, "learning_rate": 1.0223996712993529e-05, "loss": 0.7642, "step": 3657 }, { "epoch": 1.522060161713855, "grad_norm": 1.8158762454986572, "learning_rate": 1.021951750581426e-05, "loss": 0.9317, "step": 3658 }, { "epoch": 1.5224761459064555, "grad_norm": 1.8179622888565063, "learning_rate": 1.021503825457087e-05, "loss": 0.809, "step": 3659 }, { "epoch": 1.522892130099056, "grad_norm": 1.9151004552841187, "learning_rate": 1.0210558960162493e-05, "loss": 0.857, "step": 3660 }, { "epoch": 1.523308114291657, "grad_norm": 1.780120611190796, "learning_rate": 1.020607962348826e-05, "loss": 0.8805, "step": 3661 }, { "epoch": 1.5237240984842577, "grad_norm": 124.05238342285156, "learning_rate": 1.0201600245447318e-05, "loss": 0.8683, "step": 3662 }, { "epoch": 1.5241400826768583, "grad_norm": 1.8281481266021729, "learning_rate": 1.019712082693882e-05, "loss": 0.9537, "step": 3663 }, { "epoch": 1.5245560668694589, "grad_norm": 1.5936923027038574, "learning_rate": 1.0192641368861927e-05, "loss": 0.7893, "step": 3664 }, { "epoch": 1.5249720510620597, "grad_norm": 1.6963719129562378, "learning_rate": 1.0188161872115807e-05, "loss": 0.798, "step": 3665 }, { "epoch": 1.5253880352546603, "grad_norm": 1.7214314937591553, "learning_rate": 1.0183682337599638e-05, "loss": 0.8181, "step": 3666 }, { "epoch": 1.525804019447261, "grad_norm": 1.6853495836257935, "learning_rate": 1.0179202766212605e-05, "loss": 0.7951, "step": 3667 }, { "epoch": 1.5262200036398617, "grad_norm": 34.87892532348633, "learning_rate": 1.0174723158853898e-05, "loss": 0.7426, "step": 3668 }, { "epoch": 1.5266359878324622, "grad_norm": 1.8933550119400024, "learning_rate": 1.0170243516422717e-05, "loss": 0.9535, "step": 3669 }, { "epoch": 1.527051972025063, "grad_norm": 1.5816521644592285, "learning_rate": 1.0165763839818272e-05, "loss": 0.8518, "step": 3670 }, { "epoch": 1.5274679562176638, "grad_norm": 1.63481605052948, "learning_rate": 1.0161284129939772e-05, "loss": 0.8742, "step": 3671 }, { "epoch": 1.5278839404102644, "grad_norm": 1.6945022344589233, "learning_rate": 1.0156804387686436e-05, "loss": 0.8508, "step": 3672 }, { "epoch": 1.528299924602865, "grad_norm": 1.6496853828430176, "learning_rate": 1.0152324613957492e-05, "loss": 0.8195, "step": 3673 }, { "epoch": 1.5287159087954658, "grad_norm": 1.7414718866348267, "learning_rate": 1.014784480965217e-05, "loss": 0.7794, "step": 3674 }, { "epoch": 1.5291318929880664, "grad_norm": 1.6114611625671387, "learning_rate": 1.0143364975669717e-05, "loss": 0.8888, "step": 3675 }, { "epoch": 1.5295478771806672, "grad_norm": 1.6905139684677124, "learning_rate": 1.0138885112909364e-05, "loss": 0.8031, "step": 3676 }, { "epoch": 1.5299638613732678, "grad_norm": 1.814943790435791, "learning_rate": 1.013440522227037e-05, "loss": 0.843, "step": 3677 }, { "epoch": 1.5303798455658684, "grad_norm": 1.6467229127883911, "learning_rate": 1.0129925304651992e-05, "loss": 0.7908, "step": 3678 }, { "epoch": 1.5307958297584692, "grad_norm": 1.7836453914642334, "learning_rate": 1.012544536095349e-05, "loss": 0.9399, "step": 3679 }, { "epoch": 1.53121181395107, "grad_norm": 1.682088017463684, "learning_rate": 1.0120965392074127e-05, "loss": 0.8554, "step": 3680 }, { "epoch": 1.5316277981436706, "grad_norm": 1.8409991264343262, "learning_rate": 1.0116485398913181e-05, "loss": 0.8423, "step": 3681 }, { "epoch": 1.5320437823362711, "grad_norm": 25.396997451782227, "learning_rate": 1.0112005382369923e-05, "loss": 0.7106, "step": 3682 }, { "epoch": 1.532459766528872, "grad_norm": 1.63253653049469, "learning_rate": 1.0107525343343634e-05, "loss": 0.9174, "step": 3683 }, { "epoch": 1.5328757507214725, "grad_norm": 1.6950000524520874, "learning_rate": 1.0103045282733604e-05, "loss": 0.7906, "step": 3684 }, { "epoch": 1.5332917349140733, "grad_norm": 1.7575879096984863, "learning_rate": 1.0098565201439122e-05, "loss": 0.7769, "step": 3685 }, { "epoch": 1.533707719106674, "grad_norm": 1.9668662548065186, "learning_rate": 1.0094085100359478e-05, "loss": 0.822, "step": 3686 }, { "epoch": 1.5341237032992745, "grad_norm": 1.7536394596099854, "learning_rate": 1.0089604980393975e-05, "loss": 0.7445, "step": 3687 }, { "epoch": 1.5345396874918753, "grad_norm": 1.8616869449615479, "learning_rate": 1.0085124842441914e-05, "loss": 0.8746, "step": 3688 }, { "epoch": 1.534955671684476, "grad_norm": 1.5446181297302246, "learning_rate": 1.00806446874026e-05, "loss": 0.7437, "step": 3689 }, { "epoch": 1.5353716558770767, "grad_norm": 1.5795000791549683, "learning_rate": 1.0076164516175346e-05, "loss": 0.7608, "step": 3690 }, { "epoch": 1.5357876400696773, "grad_norm": 1.7779183387756348, "learning_rate": 1.0071684329659457e-05, "loss": 0.8392, "step": 3691 }, { "epoch": 1.536203624262278, "grad_norm": 1.791799545288086, "learning_rate": 1.0067204128754251e-05, "loss": 0.8158, "step": 3692 }, { "epoch": 1.5366196084548787, "grad_norm": 1.786273717880249, "learning_rate": 1.006272391435905e-05, "loss": 0.8111, "step": 3693 }, { "epoch": 1.5370355926474795, "grad_norm": 2.2267096042633057, "learning_rate": 1.0058243687373168e-05, "loss": 0.7291, "step": 3694 }, { "epoch": 1.53745157684008, "grad_norm": 1.6122955083847046, "learning_rate": 1.005376344869594e-05, "loss": 0.8119, "step": 3695 }, { "epoch": 1.5378675610326806, "grad_norm": 1.7052314281463623, "learning_rate": 1.0049283199226678e-05, "loss": 0.8099, "step": 3696 }, { "epoch": 1.5382835452252814, "grad_norm": 135.59938049316406, "learning_rate": 1.0044802939864717e-05, "loss": 0.7942, "step": 3697 }, { "epoch": 1.5386995294178822, "grad_norm": 1.967625617980957, "learning_rate": 1.0040322671509389e-05, "loss": 0.9338, "step": 3698 }, { "epoch": 1.5391155136104828, "grad_norm": 1.8001070022583008, "learning_rate": 1.0035842395060022e-05, "loss": 0.9045, "step": 3699 }, { "epoch": 1.5395314978030834, "grad_norm": 1.7143478393554688, "learning_rate": 1.003136211141595e-05, "loss": 0.7287, "step": 3700 }, { "epoch": 1.5399474819956842, "grad_norm": 1.7038905620574951, "learning_rate": 1.002688182147651e-05, "loss": 0.8154, "step": 3701 }, { "epoch": 1.5403634661882848, "grad_norm": 1.617295265197754, "learning_rate": 1.0022401526141038e-05, "loss": 0.7123, "step": 3702 }, { "epoch": 1.5407794503808856, "grad_norm": 1.7901426553726196, "learning_rate": 1.0017921226308867e-05, "loss": 0.7804, "step": 3703 }, { "epoch": 1.5411954345734862, "grad_norm": 1.6821683645248413, "learning_rate": 1.0013440922879338e-05, "loss": 0.935, "step": 3704 }, { "epoch": 1.5416114187660868, "grad_norm": 1.772094964981079, "learning_rate": 1.0008960616751795e-05, "loss": 0.936, "step": 3705 }, { "epoch": 1.5420274029586876, "grad_norm": 1.709349274635315, "learning_rate": 1.0004480308825569e-05, "loss": 0.8099, "step": 3706 }, { "epoch": 1.5424433871512884, "grad_norm": 1.7033330202102661, "learning_rate": 1e-05, "loss": 0.9184, "step": 3707 }, { "epoch": 1.542859371343889, "grad_norm": 1.7531496286392212, "learning_rate": 9.995519691174433e-06, "loss": 0.8361, "step": 3708 }, { "epoch": 1.5432753555364895, "grad_norm": 1.725057601928711, "learning_rate": 9.991039383248209e-06, "loss": 0.8106, "step": 3709 }, { "epoch": 1.5436913397290903, "grad_norm": 1.6793098449707031, "learning_rate": 9.986559077120662e-06, "loss": 0.8145, "step": 3710 }, { "epoch": 1.544107323921691, "grad_norm": 1.6632853746414185, "learning_rate": 9.982078773691136e-06, "loss": 0.8062, "step": 3711 }, { "epoch": 1.5445233081142917, "grad_norm": 1.8929964303970337, "learning_rate": 9.977598473858964e-06, "loss": 0.8427, "step": 3712 }, { "epoch": 1.5449392923068923, "grad_norm": 1.8019847869873047, "learning_rate": 9.973118178523493e-06, "loss": 0.8048, "step": 3713 }, { "epoch": 1.545355276499493, "grad_norm": 1.7576932907104492, "learning_rate": 9.968637888584051e-06, "loss": 0.9314, "step": 3714 }, { "epoch": 1.5457712606920937, "grad_norm": 1.7065386772155762, "learning_rate": 9.964157604939983e-06, "loss": 0.8968, "step": 3715 }, { "epoch": 1.5461872448846945, "grad_norm": 1.6291375160217285, "learning_rate": 9.959677328490613e-06, "loss": 0.8183, "step": 3716 }, { "epoch": 1.546603229077295, "grad_norm": 1.8194000720977783, "learning_rate": 9.955197060135286e-06, "loss": 0.8877, "step": 3717 }, { "epoch": 1.5470192132698957, "grad_norm": 1.6476819515228271, "learning_rate": 9.950716800773325e-06, "loss": 0.8226, "step": 3718 }, { "epoch": 1.5474351974624965, "grad_norm": 1.758102536201477, "learning_rate": 9.946236551304067e-06, "loss": 0.874, "step": 3719 }, { "epoch": 1.547851181655097, "grad_norm": 1.65716552734375, "learning_rate": 9.941756312626833e-06, "loss": 0.8292, "step": 3720 }, { "epoch": 1.5482671658476979, "grad_norm": 1.6334025859832764, "learning_rate": 9.937276085640957e-06, "loss": 0.8624, "step": 3721 }, { "epoch": 1.5486831500402984, "grad_norm": 1.7146952152252197, "learning_rate": 9.932795871245752e-06, "loss": 0.7845, "step": 3722 }, { "epoch": 1.549099134232899, "grad_norm": 1.7552666664123535, "learning_rate": 9.928315670340545e-06, "loss": 0.8451, "step": 3723 }, { "epoch": 1.5495151184254998, "grad_norm": 1.654988408088684, "learning_rate": 9.923835483824658e-06, "loss": 0.8246, "step": 3724 }, { "epoch": 1.5499311026181006, "grad_norm": 1.831310510635376, "learning_rate": 9.9193553125974e-06, "loss": 0.9367, "step": 3725 }, { "epoch": 1.5503470868107012, "grad_norm": 1.9258496761322021, "learning_rate": 9.914875157558088e-06, "loss": 0.8758, "step": 3726 }, { "epoch": 1.5507630710033018, "grad_norm": 1.7906709909439087, "learning_rate": 9.910395019606025e-06, "loss": 0.8471, "step": 3727 }, { "epoch": 1.5511790551959026, "grad_norm": 1.7515451908111572, "learning_rate": 9.905914899640525e-06, "loss": 0.8491, "step": 3728 }, { "epoch": 1.5515950393885032, "grad_norm": 1.7374001741409302, "learning_rate": 9.901434798560883e-06, "loss": 0.9086, "step": 3729 }, { "epoch": 1.552011023581104, "grad_norm": 1.7914838790893555, "learning_rate": 9.896954717266401e-06, "loss": 0.9377, "step": 3730 }, { "epoch": 1.5524270077737046, "grad_norm": 1.7058097124099731, "learning_rate": 9.89247465665637e-06, "loss": 0.796, "step": 3731 }, { "epoch": 1.5528429919663052, "grad_norm": 172.68203735351562, "learning_rate": 9.887994617630084e-06, "loss": 0.8058, "step": 3732 }, { "epoch": 1.553258976158906, "grad_norm": 1.833154559135437, "learning_rate": 9.883514601086822e-06, "loss": 0.8484, "step": 3733 }, { "epoch": 1.5536749603515068, "grad_norm": 1.7484874725341797, "learning_rate": 9.879034607925876e-06, "loss": 0.7621, "step": 3734 }, { "epoch": 1.5540909445441073, "grad_norm": 1.6762423515319824, "learning_rate": 9.874554639046514e-06, "loss": 0.9228, "step": 3735 }, { "epoch": 1.554506928736708, "grad_norm": 1.6697331666946411, "learning_rate": 9.870074695348013e-06, "loss": 0.7644, "step": 3736 }, { "epoch": 1.5549229129293087, "grad_norm": 1.65029776096344, "learning_rate": 9.86559477772963e-06, "loss": 0.8497, "step": 3737 }, { "epoch": 1.5553388971219093, "grad_norm": 1.926047444343567, "learning_rate": 9.861114887090637e-06, "loss": 0.8838, "step": 3738 }, { "epoch": 1.5557548813145101, "grad_norm": 1.7016565799713135, "learning_rate": 9.856635024330288e-06, "loss": 0.7918, "step": 3739 }, { "epoch": 1.5561708655071107, "grad_norm": 1.8030577898025513, "learning_rate": 9.85215519034783e-06, "loss": 0.9104, "step": 3740 }, { "epoch": 1.5565868496997113, "grad_norm": 1.7530134916305542, "learning_rate": 9.847675386042512e-06, "loss": 0.8496, "step": 3741 }, { "epoch": 1.557002833892312, "grad_norm": 1.8479523658752441, "learning_rate": 9.843195612313565e-06, "loss": 0.7912, "step": 3742 }, { "epoch": 1.557418818084913, "grad_norm": 1.8051509857177734, "learning_rate": 9.838715870060231e-06, "loss": 0.8392, "step": 3743 }, { "epoch": 1.5578348022775135, "grad_norm": 2.0537140369415283, "learning_rate": 9.83423616018173e-06, "loss": 0.9286, "step": 3744 }, { "epoch": 1.558250786470114, "grad_norm": 1.6484702825546265, "learning_rate": 9.829756483577284e-06, "loss": 0.7791, "step": 3745 }, { "epoch": 1.5586667706627149, "grad_norm": 1.650821328163147, "learning_rate": 9.825276841146104e-06, "loss": 0.8372, "step": 3746 }, { "epoch": 1.5590827548553154, "grad_norm": 1.8072609901428223, "learning_rate": 9.8207972337874e-06, "loss": 0.7933, "step": 3747 }, { "epoch": 1.5594987390479162, "grad_norm": 1.7864389419555664, "learning_rate": 9.816317662400364e-06, "loss": 0.8186, "step": 3748 }, { "epoch": 1.5599147232405168, "grad_norm": 13.842751502990723, "learning_rate": 9.811838127884196e-06, "loss": 0.7977, "step": 3749 }, { "epoch": 1.5603307074331174, "grad_norm": 1.7188912630081177, "learning_rate": 9.807358631138076e-06, "loss": 0.7514, "step": 3750 }, { "epoch": 1.5607466916257182, "grad_norm": 1.722713828086853, "learning_rate": 9.802879173061186e-06, "loss": 0.7872, "step": 3751 }, { "epoch": 1.561162675818319, "grad_norm": 1.7071986198425293, "learning_rate": 9.798399754552684e-06, "loss": 0.8949, "step": 3752 }, { "epoch": 1.5615786600109196, "grad_norm": 1.6344497203826904, "learning_rate": 9.793920376511741e-06, "loss": 0.884, "step": 3753 }, { "epoch": 1.5619946442035202, "grad_norm": 2030.3031005859375, "learning_rate": 9.78944103983751e-06, "loss": 0.9256, "step": 3754 }, { "epoch": 1.562410628396121, "grad_norm": 1.8441845178604126, "learning_rate": 9.78496174542913e-06, "loss": 0.9236, "step": 3755 }, { "epoch": 1.5628266125887216, "grad_norm": 1.7635552883148193, "learning_rate": 9.780482494185743e-06, "loss": 0.7546, "step": 3756 }, { "epoch": 1.5632425967813224, "grad_norm": 1.5990710258483887, "learning_rate": 9.77600328700647e-06, "loss": 0.7825, "step": 3757 }, { "epoch": 1.563658580973923, "grad_norm": 1.8262319564819336, "learning_rate": 9.771524124790439e-06, "loss": 0.7787, "step": 3758 }, { "epoch": 1.5640745651665235, "grad_norm": 3.82490611076355, "learning_rate": 9.767045008436753e-06, "loss": 0.816, "step": 3759 }, { "epoch": 1.5644905493591243, "grad_norm": 540.4978637695312, "learning_rate": 9.762565938844517e-06, "loss": 0.9499, "step": 3760 }, { "epoch": 1.5649065335517252, "grad_norm": 8.162109375, "learning_rate": 9.758086916912812e-06, "loss": 0.8286, "step": 3761 }, { "epoch": 1.5653225177443257, "grad_norm": 1.9672476053237915, "learning_rate": 9.753607943540734e-06, "loss": 0.9164, "step": 3762 }, { "epoch": 1.5657385019369263, "grad_norm": 1.7399566173553467, "learning_rate": 9.749129019627343e-06, "loss": 0.7609, "step": 3763 }, { "epoch": 1.5661544861295271, "grad_norm": 1.6258749961853027, "learning_rate": 9.74465014607171e-06, "loss": 0.7453, "step": 3764 }, { "epoch": 1.5665704703221277, "grad_norm": 1.9305442571640015, "learning_rate": 9.740171323772878e-06, "loss": 0.8976, "step": 3765 }, { "epoch": 1.5669864545147285, "grad_norm": 3.740009307861328, "learning_rate": 9.735692553629898e-06, "loss": 0.6991, "step": 3766 }, { "epoch": 1.567402438707329, "grad_norm": 1.9394464492797852, "learning_rate": 9.731213836541792e-06, "loss": 0.9036, "step": 3767 }, { "epoch": 1.5678184228999297, "grad_norm": 1.727446436882019, "learning_rate": 9.726735173407587e-06, "loss": 0.8742, "step": 3768 }, { "epoch": 1.5682344070925305, "grad_norm": 1.942276120185852, "learning_rate": 9.722256565126291e-06, "loss": 0.9048, "step": 3769 }, { "epoch": 1.5686503912851313, "grad_norm": 1.740583062171936, "learning_rate": 9.717778012596898e-06, "loss": 0.9765, "step": 3770 }, { "epoch": 1.5690663754777319, "grad_norm": 1.738054633140564, "learning_rate": 9.713299516718404e-06, "loss": 0.834, "step": 3771 }, { "epoch": 1.5694823596703324, "grad_norm": 1.6589854955673218, "learning_rate": 9.708821078389775e-06, "loss": 0.7856, "step": 3772 }, { "epoch": 1.5698983438629333, "grad_norm": 1.8338536024093628, "learning_rate": 9.704342698509985e-06, "loss": 0.8021, "step": 3773 }, { "epoch": 1.5703143280555338, "grad_norm": 1.6098741292953491, "learning_rate": 9.699864377977982e-06, "loss": 0.7244, "step": 3774 }, { "epoch": 1.5707303122481346, "grad_norm": 1.6935553550720215, "learning_rate": 9.695386117692708e-06, "loss": 0.8492, "step": 3775 }, { "epoch": 1.5711462964407352, "grad_norm": 15.912033081054688, "learning_rate": 9.690907918553087e-06, "loss": 0.7814, "step": 3776 }, { "epoch": 1.5715622806333358, "grad_norm": 44.53912353515625, "learning_rate": 9.686429781458042e-06, "loss": 0.8888, "step": 3777 }, { "epoch": 1.5719782648259366, "grad_norm": 1.754641056060791, "learning_rate": 9.681951707306471e-06, "loss": 0.8291, "step": 3778 }, { "epoch": 1.5723942490185374, "grad_norm": 1.8805148601531982, "learning_rate": 9.677473696997274e-06, "loss": 0.8418, "step": 3779 }, { "epoch": 1.572810233211138, "grad_norm": 1.8696659803390503, "learning_rate": 9.672995751429321e-06, "loss": 0.8133, "step": 3780 }, { "epoch": 1.5732262174037386, "grad_norm": 1.7989197969436646, "learning_rate": 9.668517871501483e-06, "loss": 0.7996, "step": 3781 }, { "epoch": 1.5736422015963394, "grad_norm": 2.0571651458740234, "learning_rate": 9.664040058112607e-06, "loss": 0.8647, "step": 3782 }, { "epoch": 1.57405818578894, "grad_norm": 1.7338565587997437, "learning_rate": 9.659562312161537e-06, "loss": 0.8317, "step": 3783 }, { "epoch": 1.5744741699815408, "grad_norm": 3.620126724243164, "learning_rate": 9.655084634547097e-06, "loss": 0.8633, "step": 3784 }, { "epoch": 1.5748901541741414, "grad_norm": 3.6358253955841064, "learning_rate": 9.650607026168095e-06, "loss": 0.8173, "step": 3785 }, { "epoch": 1.575306138366742, "grad_norm": 5.580855846405029, "learning_rate": 9.646129487923338e-06, "loss": 0.82, "step": 3786 }, { "epoch": 1.5757221225593427, "grad_norm": 1.8812761306762695, "learning_rate": 9.641652020711597e-06, "loss": 0.7905, "step": 3787 }, { "epoch": 1.5761381067519435, "grad_norm": 75.01824951171875, "learning_rate": 9.637174625431654e-06, "loss": 0.8479, "step": 3788 }, { "epoch": 1.5765540909445441, "grad_norm": 1.8484433889389038, "learning_rate": 9.632697302982257e-06, "loss": 0.8102, "step": 3789 }, { "epoch": 1.5769700751371447, "grad_norm": 266.2782897949219, "learning_rate": 9.62822005426215e-06, "loss": 0.8853, "step": 3790 }, { "epoch": 1.5773860593297455, "grad_norm": 1.6957738399505615, "learning_rate": 9.623742880170052e-06, "loss": 0.8036, "step": 3791 }, { "epoch": 1.577802043522346, "grad_norm": 1.7256609201431274, "learning_rate": 9.619265781604684e-06, "loss": 0.8655, "step": 3792 }, { "epoch": 1.578218027714947, "grad_norm": 1.7879846096038818, "learning_rate": 9.614788759464733e-06, "loss": 0.8199, "step": 3793 }, { "epoch": 1.5786340119075475, "grad_norm": 1.8168466091156006, "learning_rate": 9.610311814648886e-06, "loss": 0.8993, "step": 3794 }, { "epoch": 1.579049996100148, "grad_norm": 1.682291030883789, "learning_rate": 9.605834948055797e-06, "loss": 0.7689, "step": 3795 }, { "epoch": 1.5794659802927489, "grad_norm": 64.10923767089844, "learning_rate": 9.601358160584127e-06, "loss": 0.8149, "step": 3796 }, { "epoch": 1.5798819644853497, "grad_norm": 1.6957852840423584, "learning_rate": 9.596881453132499e-06, "loss": 0.8241, "step": 3797 }, { "epoch": 1.5802979486779503, "grad_norm": 1.634209394454956, "learning_rate": 9.59240482659954e-06, "loss": 0.7744, "step": 3798 }, { "epoch": 1.5807139328705508, "grad_norm": 1.8386415243148804, "learning_rate": 9.587928281883844e-06, "loss": 0.7998, "step": 3799 }, { "epoch": 1.5811299170631516, "grad_norm": 1.6578879356384277, "learning_rate": 9.583451819883993e-06, "loss": 0.853, "step": 3800 }, { "epoch": 1.5815459012557522, "grad_norm": 1.7919050455093384, "learning_rate": 9.578975441498566e-06, "loss": 0.784, "step": 3801 }, { "epoch": 1.581961885448353, "grad_norm": 1.8478660583496094, "learning_rate": 9.5744991476261e-06, "loss": 0.7259, "step": 3802 }, { "epoch": 1.5823778696409536, "grad_norm": 12.485679626464844, "learning_rate": 9.57002293916514e-06, "loss": 0.8093, "step": 3803 }, { "epoch": 1.5827938538335542, "grad_norm": 1.851240634918213, "learning_rate": 9.565546817014195e-06, "loss": 0.7857, "step": 3804 }, { "epoch": 1.583209838026155, "grad_norm": 1.8717315196990967, "learning_rate": 9.561070782071773e-06, "loss": 0.9078, "step": 3805 }, { "epoch": 1.5836258222187558, "grad_norm": 1.8231186866760254, "learning_rate": 9.556594835236348e-06, "loss": 0.755, "step": 3806 }, { "epoch": 1.5840418064113564, "grad_norm": 1.8091000318527222, "learning_rate": 9.552118977406389e-06, "loss": 0.8933, "step": 3807 }, { "epoch": 1.584457790603957, "grad_norm": 1.9223500490188599, "learning_rate": 9.547643209480339e-06, "loss": 0.9075, "step": 3808 }, { "epoch": 1.5848737747965578, "grad_norm": 1.7838513851165771, "learning_rate": 9.543167532356631e-06, "loss": 0.8691, "step": 3809 }, { "epoch": 1.5852897589891584, "grad_norm": 1.66860032081604, "learning_rate": 9.538691946933668e-06, "loss": 0.8322, "step": 3810 }, { "epoch": 1.5857057431817592, "grad_norm": 1.7630473375320435, "learning_rate": 9.534216454109852e-06, "loss": 0.8614, "step": 3811 }, { "epoch": 1.5861217273743597, "grad_norm": 1.7191457748413086, "learning_rate": 9.529741054783545e-06, "loss": 0.8271, "step": 3812 }, { "epoch": 1.5865377115669603, "grad_norm": 2.2471137046813965, "learning_rate": 9.52526574985311e-06, "loss": 0.8732, "step": 3813 }, { "epoch": 1.5869536957595611, "grad_norm": 1.6863954067230225, "learning_rate": 9.52079054021688e-06, "loss": 0.8598, "step": 3814 }, { "epoch": 1.587369679952162, "grad_norm": 1.678280234336853, "learning_rate": 9.516315426773166e-06, "loss": 0.7511, "step": 3815 }, { "epoch": 1.5877856641447625, "grad_norm": 1.5937724113464355, "learning_rate": 9.511840410420277e-06, "loss": 0.8677, "step": 3816 }, { "epoch": 1.588201648337363, "grad_norm": 1.6054637432098389, "learning_rate": 9.507365492056478e-06, "loss": 0.7849, "step": 3817 }, { "epoch": 1.588617632529964, "grad_norm": 1.7078056335449219, "learning_rate": 9.502890672580035e-06, "loss": 0.8394, "step": 3818 }, { "epoch": 1.5890336167225645, "grad_norm": 1.878159523010254, "learning_rate": 9.498415952889182e-06, "loss": 0.7791, "step": 3819 }, { "epoch": 1.5894496009151653, "grad_norm": 1.8089320659637451, "learning_rate": 9.49394133388214e-06, "loss": 0.8641, "step": 3820 }, { "epoch": 1.5898655851077659, "grad_norm": 1.8133010864257812, "learning_rate": 9.489466816457102e-06, "loss": 0.9108, "step": 3821 }, { "epoch": 1.5902815693003665, "grad_norm": 1.8262760639190674, "learning_rate": 9.484992401512253e-06, "loss": 0.8833, "step": 3822 }, { "epoch": 1.5906975534929673, "grad_norm": 1.8929260969161987, "learning_rate": 9.480518089945742e-06, "loss": 0.902, "step": 3823 }, { "epoch": 1.591113537685568, "grad_norm": 1.6937977075576782, "learning_rate": 9.476043882655712e-06, "loss": 0.8123, "step": 3824 }, { "epoch": 1.5915295218781687, "grad_norm": 1.7142488956451416, "learning_rate": 9.471569780540269e-06, "loss": 0.8241, "step": 3825 }, { "epoch": 1.5919455060707692, "grad_norm": 1.589643955230713, "learning_rate": 9.467095784497517e-06, "loss": 0.7846, "step": 3826 }, { "epoch": 1.59236149026337, "grad_norm": 1.709787368774414, "learning_rate": 9.462621895425522e-06, "loss": 0.8984, "step": 3827 }, { "epoch": 1.5927774744559706, "grad_norm": 1.6847519874572754, "learning_rate": 9.45814811422234e-06, "loss": 0.812, "step": 3828 }, { "epoch": 1.5931934586485714, "grad_norm": 1.5534415245056152, "learning_rate": 9.453674441785996e-06, "loss": 0.8134, "step": 3829 }, { "epoch": 1.593609442841172, "grad_norm": 1.8408069610595703, "learning_rate": 9.4492008790145e-06, "loss": 0.8939, "step": 3830 }, { "epoch": 1.5940254270337726, "grad_norm": 1.6529312133789062, "learning_rate": 9.444727426805841e-06, "loss": 0.7052, "step": 3831 }, { "epoch": 1.5944414112263734, "grad_norm": 18.219324111938477, "learning_rate": 9.440254086057977e-06, "loss": 0.8189, "step": 3832 }, { "epoch": 1.5948573954189742, "grad_norm": 1.5650490522384644, "learning_rate": 9.435780857668854e-06, "loss": 0.8145, "step": 3833 }, { "epoch": 1.5952733796115748, "grad_norm": 1.765276551246643, "learning_rate": 9.431307742536385e-06, "loss": 0.8393, "step": 3834 }, { "epoch": 1.5956893638041754, "grad_norm": 1.6924018859863281, "learning_rate": 9.426834741558472e-06, "loss": 0.936, "step": 3835 }, { "epoch": 1.5961053479967762, "grad_norm": 1.7398265600204468, "learning_rate": 9.422361855632983e-06, "loss": 0.8157, "step": 3836 }, { "epoch": 1.5965213321893768, "grad_norm": 1.7345188856124878, "learning_rate": 9.417889085657774e-06, "loss": 0.8969, "step": 3837 }, { "epoch": 1.5969373163819776, "grad_norm": 1.7288767099380493, "learning_rate": 9.413416432530663e-06, "loss": 0.7294, "step": 3838 }, { "epoch": 1.5973533005745781, "grad_norm": 1.7590975761413574, "learning_rate": 9.408943897149462e-06, "loss": 0.8163, "step": 3839 }, { "epoch": 1.5977692847671787, "grad_norm": 1.5285745859146118, "learning_rate": 9.404471480411943e-06, "loss": 0.6424, "step": 3840 }, { "epoch": 1.5981852689597795, "grad_norm": 1.6028180122375488, "learning_rate": 9.39999918321587e-06, "loss": 0.7476, "step": 3841 }, { "epoch": 1.5986012531523803, "grad_norm": 1.8781211376190186, "learning_rate": 9.395527006458964e-06, "loss": 0.8512, "step": 3842 }, { "epoch": 1.599017237344981, "grad_norm": 1.7427490949630737, "learning_rate": 9.391054951038944e-06, "loss": 0.8632, "step": 3843 }, { "epoch": 1.5994332215375815, "grad_norm": 1.837950587272644, "learning_rate": 9.386583017853484e-06, "loss": 0.8829, "step": 3844 }, { "epoch": 1.5998492057301823, "grad_norm": 20.68440818786621, "learning_rate": 9.382111207800246e-06, "loss": 0.6605, "step": 3845 }, { "epoch": 1.6002651899227829, "grad_norm": 1.721832036972046, "learning_rate": 9.377639521776868e-06, "loss": 0.803, "step": 3846 }, { "epoch": 1.6006811741153837, "grad_norm": 1.940321445465088, "learning_rate": 9.373167960680954e-06, "loss": 0.8729, "step": 3847 }, { "epoch": 1.6010971583079843, "grad_norm": 1.7048726081848145, "learning_rate": 9.368696525410092e-06, "loss": 0.8367, "step": 3848 }, { "epoch": 1.6015131425005849, "grad_norm": 2.374946117401123, "learning_rate": 9.364225216861832e-06, "loss": 0.9084, "step": 3849 }, { "epoch": 1.6019291266931857, "grad_norm": 1.6959224939346313, "learning_rate": 9.35975403593372e-06, "loss": 0.8277, "step": 3850 }, { "epoch": 1.6023451108857865, "grad_norm": 6.636180400848389, "learning_rate": 9.355282983523251e-06, "loss": 0.7452, "step": 3851 }, { "epoch": 1.602761095078387, "grad_norm": 1.6826326847076416, "learning_rate": 9.350812060527917e-06, "loss": 0.7629, "step": 3852 }, { "epoch": 1.6031770792709876, "grad_norm": 1.8034454584121704, "learning_rate": 9.34634126784517e-06, "loss": 0.7769, "step": 3853 }, { "epoch": 1.6035930634635884, "grad_norm": 1.61942720413208, "learning_rate": 9.34187060637244e-06, "loss": 0.8124, "step": 3854 }, { "epoch": 1.604009047656189, "grad_norm": 1.8147859573364258, "learning_rate": 9.337400077007126e-06, "loss": 0.792, "step": 3855 }, { "epoch": 1.6044250318487898, "grad_norm": 1.6918917894363403, "learning_rate": 9.332929680646614e-06, "loss": 0.7245, "step": 3856 }, { "epoch": 1.6048410160413904, "grad_norm": 1.6293699741363525, "learning_rate": 9.328459418188243e-06, "loss": 0.8482, "step": 3857 }, { "epoch": 1.605257000233991, "grad_norm": 1.765343427658081, "learning_rate": 9.323989290529349e-06, "loss": 0.7734, "step": 3858 }, { "epoch": 1.6056729844265918, "grad_norm": 5.819189548492432, "learning_rate": 9.319519298567218e-06, "loss": 0.8523, "step": 3859 }, { "epoch": 1.6060889686191926, "grad_norm": 1.7618821859359741, "learning_rate": 9.315049443199121e-06, "loss": 0.8964, "step": 3860 }, { "epoch": 1.6065049528117932, "grad_norm": 1.739582896232605, "learning_rate": 9.310579725322305e-06, "loss": 0.9377, "step": 3861 }, { "epoch": 1.6069209370043938, "grad_norm": 5.990169048309326, "learning_rate": 9.306110145833979e-06, "loss": 0.7556, "step": 3862 }, { "epoch": 1.6073369211969946, "grad_norm": 1.8507963418960571, "learning_rate": 9.301640705631332e-06, "loss": 0.7928, "step": 3863 }, { "epoch": 1.6077529053895951, "grad_norm": 1.602355718612671, "learning_rate": 9.297171405611515e-06, "loss": 0.7838, "step": 3864 }, { "epoch": 1.608168889582196, "grad_norm": 1.7007732391357422, "learning_rate": 9.29270224667167e-06, "loss": 0.7532, "step": 3865 }, { "epoch": 1.6085848737747965, "grad_norm": 1.6487809419631958, "learning_rate": 9.288233229708887e-06, "loss": 0.8413, "step": 3866 }, { "epoch": 1.6090008579673971, "grad_norm": 1.7653218507766724, "learning_rate": 9.283764355620251e-06, "loss": 0.8885, "step": 3867 }, { "epoch": 1.609416842159998, "grad_norm": 1.7955513000488281, "learning_rate": 9.279295625302797e-06, "loss": 0.8561, "step": 3868 }, { "epoch": 1.6098328263525987, "grad_norm": 1.8402400016784668, "learning_rate": 9.274827039653546e-06, "loss": 0.8863, "step": 3869 }, { "epoch": 1.6102488105451993, "grad_norm": 1.7419997453689575, "learning_rate": 9.27035859956948e-06, "loss": 0.7885, "step": 3870 }, { "epoch": 1.6106647947377999, "grad_norm": 1.7362596988677979, "learning_rate": 9.265890305947562e-06, "loss": 0.79, "step": 3871 }, { "epoch": 1.6110807789304007, "grad_norm": 1.843137502670288, "learning_rate": 9.261422159684717e-06, "loss": 0.8289, "step": 3872 }, { "epoch": 1.6114967631230013, "grad_norm": 1.8808088302612305, "learning_rate": 9.256954161677845e-06, "loss": 0.7686, "step": 3873 }, { "epoch": 1.611912747315602, "grad_norm": 44.69729232788086, "learning_rate": 9.25248631282381e-06, "loss": 0.7351, "step": 3874 }, { "epoch": 1.6123287315082027, "grad_norm": 1.5960047245025635, "learning_rate": 9.248018614019453e-06, "loss": 0.673, "step": 3875 }, { "epoch": 1.6127447157008032, "grad_norm": 1.6164721250534058, "learning_rate": 9.243551066161591e-06, "loss": 0.8112, "step": 3876 }, { "epoch": 1.613160699893404, "grad_norm": 1.7959150075912476, "learning_rate": 9.239083670146992e-06, "loss": 0.8816, "step": 3877 }, { "epoch": 1.6135766840860049, "grad_norm": 215.7467041015625, "learning_rate": 9.234616426872411e-06, "loss": 0.9528, "step": 3878 }, { "epoch": 1.6139926682786054, "grad_norm": 1.5869534015655518, "learning_rate": 9.230149337234556e-06, "loss": 0.8056, "step": 3879 }, { "epoch": 1.614408652471206, "grad_norm": 38.128662109375, "learning_rate": 9.225682402130125e-06, "loss": 0.8459, "step": 3880 }, { "epoch": 1.6148246366638068, "grad_norm": 1.732145071029663, "learning_rate": 9.221215622455763e-06, "loss": 0.7801, "step": 3881 }, { "epoch": 1.6152406208564074, "grad_norm": 1.7907058000564575, "learning_rate": 9.216748999108103e-06, "loss": 0.8713, "step": 3882 }, { "epoch": 1.6156566050490082, "grad_norm": 1.7960093021392822, "learning_rate": 9.212282532983731e-06, "loss": 0.8175, "step": 3883 }, { "epoch": 1.6160725892416088, "grad_norm": 1.6929816007614136, "learning_rate": 9.207816224979214e-06, "loss": 0.7079, "step": 3884 }, { "epoch": 1.6164885734342094, "grad_norm": 1.8059382438659668, "learning_rate": 9.203350075991074e-06, "loss": 0.922, "step": 3885 }, { "epoch": 1.6169045576268102, "grad_norm": 1.6808120012283325, "learning_rate": 9.198884086915816e-06, "loss": 0.7115, "step": 3886 }, { "epoch": 1.617320541819411, "grad_norm": 2.1148314476013184, "learning_rate": 9.194418258649902e-06, "loss": 0.8738, "step": 3887 }, { "epoch": 1.6177365260120116, "grad_norm": 1.8171311616897583, "learning_rate": 9.189952592089768e-06, "loss": 0.9488, "step": 3888 }, { "epoch": 1.6181525102046121, "grad_norm": 3.949686288833618, "learning_rate": 9.185487088131808e-06, "loss": 0.8032, "step": 3889 }, { "epoch": 1.618568494397213, "grad_norm": 1.6925921440124512, "learning_rate": 9.181021747672393e-06, "loss": 0.7888, "step": 3890 }, { "epoch": 1.6189844785898135, "grad_norm": 1.8473179340362549, "learning_rate": 9.176556571607866e-06, "loss": 0.9769, "step": 3891 }, { "epoch": 1.6194004627824143, "grad_norm": 1.6812000274658203, "learning_rate": 9.172091560834521e-06, "loss": 0.8763, "step": 3892 }, { "epoch": 1.619816446975015, "grad_norm": 1.8916281461715698, "learning_rate": 9.16762671624863e-06, "loss": 0.8628, "step": 3893 }, { "epoch": 1.6202324311676155, "grad_norm": 1.722577691078186, "learning_rate": 9.163162038746424e-06, "loss": 0.8152, "step": 3894 }, { "epoch": 1.6206484153602163, "grad_norm": 1.7004306316375732, "learning_rate": 9.158697529224116e-06, "loss": 0.8104, "step": 3895 }, { "epoch": 1.6210643995528171, "grad_norm": 1.6869863271713257, "learning_rate": 9.154233188577865e-06, "loss": 0.831, "step": 3896 }, { "epoch": 1.6214803837454177, "grad_norm": 1.7444727420806885, "learning_rate": 9.149769017703807e-06, "loss": 0.8038, "step": 3897 }, { "epoch": 1.6218963679380183, "grad_norm": 1.832688570022583, "learning_rate": 9.145305017498045e-06, "loss": 0.7245, "step": 3898 }, { "epoch": 1.622312352130619, "grad_norm": 1.7518116235733032, "learning_rate": 9.140841188856646e-06, "loss": 0.8909, "step": 3899 }, { "epoch": 1.6227283363232197, "grad_norm": 1.7116482257843018, "learning_rate": 9.136377532675636e-06, "loss": 0.7126, "step": 3900 }, { "epoch": 1.6231443205158205, "grad_norm": 1.8557581901550293, "learning_rate": 9.131914049851021e-06, "loss": 0.9099, "step": 3901 }, { "epoch": 1.623560304708421, "grad_norm": 1.7661263942718506, "learning_rate": 9.127450741278756e-06, "loss": 0.953, "step": 3902 }, { "epoch": 1.6239762889010216, "grad_norm": 1.6573749780654907, "learning_rate": 9.122987607854773e-06, "loss": 0.82, "step": 3903 }, { "epoch": 1.6243922730936224, "grad_norm": 1.9761757850646973, "learning_rate": 9.118524650474959e-06, "loss": 0.8312, "step": 3904 }, { "epoch": 1.6248082572862232, "grad_norm": 1.7276631593704224, "learning_rate": 9.114061870035179e-06, "loss": 0.7519, "step": 3905 }, { "epoch": 1.6252242414788238, "grad_norm": 1.6725953817367554, "learning_rate": 9.109599267431243e-06, "loss": 0.835, "step": 3906 }, { "epoch": 1.6256402256714244, "grad_norm": 1.8470449447631836, "learning_rate": 9.105136843558949e-06, "loss": 0.7614, "step": 3907 }, { "epoch": 1.6260562098640252, "grad_norm": 1.8138095140457153, "learning_rate": 9.100674599314042e-06, "loss": 0.8082, "step": 3908 }, { "epoch": 1.6264721940566258, "grad_norm": 1.774588704109192, "learning_rate": 9.09621253559223e-06, "loss": 0.8551, "step": 3909 }, { "epoch": 1.6268881782492266, "grad_norm": 1.5481780767440796, "learning_rate": 9.0917506532892e-06, "loss": 0.8792, "step": 3910 }, { "epoch": 1.6273041624418272, "grad_norm": 3.2365942001342773, "learning_rate": 9.087288953300589e-06, "loss": 0.8079, "step": 3911 }, { "epoch": 1.6277201466344278, "grad_norm": 1.7998318672180176, "learning_rate": 9.082827436522002e-06, "loss": 0.925, "step": 3912 }, { "epoch": 1.6281361308270286, "grad_norm": 1.7715404033660889, "learning_rate": 9.078366103849002e-06, "loss": 0.8597, "step": 3913 }, { "epoch": 1.6285521150196294, "grad_norm": 1.6700870990753174, "learning_rate": 9.073904956177128e-06, "loss": 0.9071, "step": 3914 }, { "epoch": 1.62896809921223, "grad_norm": 1.787583589553833, "learning_rate": 9.069443994401865e-06, "loss": 0.9509, "step": 3915 }, { "epoch": 1.6293840834048305, "grad_norm": 1.7476481199264526, "learning_rate": 9.06498321941868e-06, "loss": 0.807, "step": 3916 }, { "epoch": 1.6298000675974313, "grad_norm": 1.6539546251296997, "learning_rate": 9.060522632122981e-06, "loss": 0.9782, "step": 3917 }, { "epoch": 1.630216051790032, "grad_norm": 1.846384048461914, "learning_rate": 9.05606223341016e-06, "loss": 0.7912, "step": 3918 }, { "epoch": 1.6306320359826327, "grad_norm": 1.9320567846298218, "learning_rate": 9.051602024175547e-06, "loss": 0.7565, "step": 3919 }, { "epoch": 1.6310480201752333, "grad_norm": 2.2195448875427246, "learning_rate": 9.04714200531446e-06, "loss": 0.855, "step": 3920 }, { "epoch": 1.631464004367834, "grad_norm": 1.6288013458251953, "learning_rate": 9.042682177722156e-06, "loss": 0.8004, "step": 3921 }, { "epoch": 1.6318799885604347, "grad_norm": 162.74717712402344, "learning_rate": 9.03822254229387e-06, "loss": 0.8204, "step": 3922 }, { "epoch": 1.6322959727530355, "grad_norm": 1.8144088983535767, "learning_rate": 9.033763099924796e-06, "loss": 0.9415, "step": 3923 }, { "epoch": 1.632711956945636, "grad_norm": 1.6628966331481934, "learning_rate": 9.029303851510072e-06, "loss": 0.7054, "step": 3924 }, { "epoch": 1.6331279411382367, "grad_norm": 1.689650535583496, "learning_rate": 9.024844797944824e-06, "loss": 0.7831, "step": 3925 }, { "epoch": 1.6335439253308375, "grad_norm": 1.6174143552780151, "learning_rate": 9.020385940124118e-06, "loss": 0.7489, "step": 3926 }, { "epoch": 1.633959909523438, "grad_norm": 1.789175271987915, "learning_rate": 9.015927278942991e-06, "loss": 0.8273, "step": 3927 }, { "epoch": 1.6343758937160389, "grad_norm": 1.817177176475525, "learning_rate": 9.011468815296433e-06, "loss": 0.8118, "step": 3928 }, { "epoch": 1.6347918779086394, "grad_norm": 1.7784315347671509, "learning_rate": 9.007010550079406e-06, "loss": 0.9301, "step": 3929 }, { "epoch": 1.63520786210124, "grad_norm": 1.6492094993591309, "learning_rate": 9.002552484186817e-06, "loss": 0.6909, "step": 3930 }, { "epoch": 1.6356238462938408, "grad_norm": 11.822525978088379, "learning_rate": 8.99809461851355e-06, "loss": 0.8134, "step": 3931 }, { "epoch": 1.6360398304864416, "grad_norm": 1.900886058807373, "learning_rate": 8.993636953954433e-06, "loss": 0.758, "step": 3932 }, { "epoch": 1.6364558146790422, "grad_norm": 1.8854990005493164, "learning_rate": 8.989179491404265e-06, "loss": 0.8351, "step": 3933 }, { "epoch": 1.6368717988716428, "grad_norm": 1.8326959609985352, "learning_rate": 8.984722231757793e-06, "loss": 0.8835, "step": 3934 }, { "epoch": 1.6372877830642436, "grad_norm": 1.7207313776016235, "learning_rate": 8.98026517590974e-06, "loss": 0.811, "step": 3935 }, { "epoch": 1.6377037672568442, "grad_norm": 1.7430682182312012, "learning_rate": 8.97580832475477e-06, "loss": 0.8468, "step": 3936 }, { "epoch": 1.638119751449445, "grad_norm": 1.6225663423538208, "learning_rate": 8.971351679187515e-06, "loss": 0.8929, "step": 3937 }, { "epoch": 1.6385357356420456, "grad_norm": 1.8294315338134766, "learning_rate": 8.966895240102573e-06, "loss": 0.8394, "step": 3938 }, { "epoch": 1.6389517198346462, "grad_norm": 1.7860944271087646, "learning_rate": 8.962439008394482e-06, "loss": 0.8275, "step": 3939 }, { "epoch": 1.639367704027247, "grad_norm": 1.79617440700531, "learning_rate": 8.957982984957759e-06, "loss": 0.8902, "step": 3940 }, { "epoch": 1.6397836882198478, "grad_norm": 1.6741969585418701, "learning_rate": 8.95352717068686e-06, "loss": 0.8624, "step": 3941 }, { "epoch": 1.6401996724124484, "grad_norm": 1.6193205118179321, "learning_rate": 8.949071566476215e-06, "loss": 0.7897, "step": 3942 }, { "epoch": 1.640615656605049, "grad_norm": 1.6955136060714722, "learning_rate": 8.944616173220197e-06, "loss": 0.7764, "step": 3943 }, { "epoch": 1.6410316407976497, "grad_norm": 1.7704870700836182, "learning_rate": 8.940160991813157e-06, "loss": 0.825, "step": 3944 }, { "epoch": 1.6414476249902503, "grad_norm": 1.6854735612869263, "learning_rate": 8.935706023149374e-06, "loss": 0.8442, "step": 3945 }, { "epoch": 1.6418636091828511, "grad_norm": 1.720444917678833, "learning_rate": 8.931251268123119e-06, "loss": 0.8124, "step": 3946 }, { "epoch": 1.6422795933754517, "grad_norm": 1.8270517587661743, "learning_rate": 8.926796727628591e-06, "loss": 0.8367, "step": 3947 }, { "epoch": 1.6426955775680523, "grad_norm": 1.6934704780578613, "learning_rate": 8.922342402559963e-06, "loss": 0.853, "step": 3948 }, { "epoch": 1.643111561760653, "grad_norm": 1.8272143602371216, "learning_rate": 8.917888293811354e-06, "loss": 0.9878, "step": 3949 }, { "epoch": 1.643527545953254, "grad_norm": 2082.802490234375, "learning_rate": 8.91343440227685e-06, "loss": 0.776, "step": 3950 }, { "epoch": 1.6439435301458545, "grad_norm": 1.8610535860061646, "learning_rate": 8.908980728850483e-06, "loss": 0.9046, "step": 3951 }, { "epoch": 1.644359514338455, "grad_norm": 33.11808776855469, "learning_rate": 8.904527274426249e-06, "loss": 0.8187, "step": 3952 }, { "epoch": 1.6447754985310559, "grad_norm": 1.6101593971252441, "learning_rate": 8.900074039898101e-06, "loss": 0.7186, "step": 3953 }, { "epoch": 1.6451914827236565, "grad_norm": 1.5971523523330688, "learning_rate": 8.895621026159937e-06, "loss": 0.8077, "step": 3954 }, { "epoch": 1.6456074669162573, "grad_norm": 2.020660161972046, "learning_rate": 8.891168234105625e-06, "loss": 0.9248, "step": 3955 }, { "epoch": 1.6460234511088578, "grad_norm": 1.802403211593628, "learning_rate": 8.886715664628978e-06, "loss": 0.8883, "step": 3956 }, { "epoch": 1.6464394353014584, "grad_norm": 1.703928828239441, "learning_rate": 8.88226331862377e-06, "loss": 0.7793, "step": 3957 }, { "epoch": 1.6468554194940592, "grad_norm": 1.7434816360473633, "learning_rate": 8.87781119698372e-06, "loss": 0.8265, "step": 3958 }, { "epoch": 1.64727140368666, "grad_norm": 1.717177152633667, "learning_rate": 8.87335930060252e-06, "loss": 0.8481, "step": 3959 }, { "epoch": 1.6476873878792606, "grad_norm": 1.7555211782455444, "learning_rate": 8.868907630373798e-06, "loss": 0.9089, "step": 3960 }, { "epoch": 1.6481033720718612, "grad_norm": 1.7018373012542725, "learning_rate": 8.864456187191155e-06, "loss": 0.8196, "step": 3961 }, { "epoch": 1.648519356264462, "grad_norm": 1.8756787776947021, "learning_rate": 8.860004971948129e-06, "loss": 0.8778, "step": 3962 }, { "epoch": 1.6489353404570626, "grad_norm": 1.7292473316192627, "learning_rate": 8.855553985538225e-06, "loss": 0.7771, "step": 3963 }, { "epoch": 1.6493513246496634, "grad_norm": 1.769152283668518, "learning_rate": 8.851103228854888e-06, "loss": 0.9278, "step": 3964 }, { "epoch": 1.649767308842264, "grad_norm": 1.6792140007019043, "learning_rate": 8.846652702791537e-06, "loss": 0.7082, "step": 3965 }, { "epoch": 1.6501832930348646, "grad_norm": 1.6919655799865723, "learning_rate": 8.842202408241527e-06, "loss": 0.7268, "step": 3966 }, { "epoch": 1.6505992772274654, "grad_norm": 6.933046340942383, "learning_rate": 8.837752346098173e-06, "loss": 0.825, "step": 3967 }, { "epoch": 1.6510152614200662, "grad_norm": 1.7186158895492554, "learning_rate": 8.83330251725475e-06, "loss": 0.9381, "step": 3968 }, { "epoch": 1.6514312456126667, "grad_norm": 1.6173617839813232, "learning_rate": 8.828852922604468e-06, "loss": 0.8476, "step": 3969 }, { "epoch": 1.6518472298052673, "grad_norm": 3.445258855819702, "learning_rate": 8.824403563040514e-06, "loss": 0.7018, "step": 3970 }, { "epoch": 1.6522632139978681, "grad_norm": 1.687713623046875, "learning_rate": 8.819954439456008e-06, "loss": 0.7789, "step": 3971 }, { "epoch": 1.6526791981904687, "grad_norm": 1.7846449613571167, "learning_rate": 8.815505552744034e-06, "loss": 0.8961, "step": 3972 }, { "epoch": 1.6530951823830695, "grad_norm": 1.6565310955047607, "learning_rate": 8.81105690379762e-06, "loss": 0.8039, "step": 3973 }, { "epoch": 1.65351116657567, "grad_norm": 1.5814924240112305, "learning_rate": 8.806608493509757e-06, "loss": 0.8837, "step": 3974 }, { "epoch": 1.6539271507682707, "grad_norm": 1.801269769668579, "learning_rate": 8.802160322773372e-06, "loss": 0.7849, "step": 3975 }, { "epoch": 1.6543431349608715, "grad_norm": 296.9795837402344, "learning_rate": 8.797712392481364e-06, "loss": 0.826, "step": 3976 }, { "epoch": 1.6547591191534723, "grad_norm": 1.8072645664215088, "learning_rate": 8.79326470352657e-06, "loss": 0.872, "step": 3977 }, { "epoch": 1.6551751033460729, "grad_norm": 1.708678960800171, "learning_rate": 8.78881725680178e-06, "loss": 0.7725, "step": 3978 }, { "epoch": 1.6555910875386735, "grad_norm": 1.7838319540023804, "learning_rate": 8.784370053199737e-06, "loss": 0.8766, "step": 3979 }, { "epoch": 1.6560070717312743, "grad_norm": 1.7904008626937866, "learning_rate": 8.779923093613143e-06, "loss": 0.7946, "step": 3980 }, { "epoch": 1.6564230559238748, "grad_norm": 1.7333093881607056, "learning_rate": 8.775476378934635e-06, "loss": 0.6879, "step": 3981 }, { "epoch": 1.6568390401164756, "grad_norm": 1.7063347101211548, "learning_rate": 8.77102991005681e-06, "loss": 0.7046, "step": 3982 }, { "epoch": 1.6572550243090762, "grad_norm": 1.756068468093872, "learning_rate": 8.766583687872223e-06, "loss": 0.8178, "step": 3983 }, { "epoch": 1.6576710085016768, "grad_norm": 1.6464788913726807, "learning_rate": 8.762137713273362e-06, "loss": 0.7468, "step": 3984 }, { "epoch": 1.6580869926942776, "grad_norm": 19.74749183654785, "learning_rate": 8.757691987152684e-06, "loss": 0.8936, "step": 3985 }, { "epoch": 1.6585029768868784, "grad_norm": 1.8033429384231567, "learning_rate": 8.753246510402583e-06, "loss": 0.7363, "step": 3986 }, { "epoch": 1.658918961079479, "grad_norm": 19.86508560180664, "learning_rate": 8.748801283915406e-06, "loss": 0.7429, "step": 3987 }, { "epoch": 1.6593349452720796, "grad_norm": 1.9501053094863892, "learning_rate": 8.744356308583449e-06, "loss": 0.953, "step": 3988 }, { "epoch": 1.6597509294646804, "grad_norm": 1.6384589672088623, "learning_rate": 8.739911585298963e-06, "loss": 0.772, "step": 3989 }, { "epoch": 1.660166913657281, "grad_norm": 1.8299247026443481, "learning_rate": 8.735467114954144e-06, "loss": 0.8118, "step": 3990 }, { "epoch": 1.6605828978498818, "grad_norm": 1.683194875717163, "learning_rate": 8.73102289844114e-06, "loss": 0.7994, "step": 3991 }, { "epoch": 1.6609988820424824, "grad_norm": 1.707252860069275, "learning_rate": 8.726578936652039e-06, "loss": 0.8209, "step": 3992 }, { "epoch": 1.661414866235083, "grad_norm": 1.7551132440567017, "learning_rate": 8.722135230478895e-06, "loss": 0.8429, "step": 3993 }, { "epoch": 1.6618308504276837, "grad_norm": 7.833304405212402, "learning_rate": 8.71769178081369e-06, "loss": 0.8122, "step": 3994 }, { "epoch": 1.6622468346202846, "grad_norm": 1.6210803985595703, "learning_rate": 8.713248588548375e-06, "loss": 0.8158, "step": 3995 }, { "epoch": 1.6626628188128851, "grad_norm": 1.8709825277328491, "learning_rate": 8.708805654574833e-06, "loss": 0.87, "step": 3996 }, { "epoch": 1.6630788030054857, "grad_norm": 1.6435250043869019, "learning_rate": 8.7043629797849e-06, "loss": 0.8448, "step": 3997 }, { "epoch": 1.6634947871980865, "grad_norm": 1.7071179151535034, "learning_rate": 8.69992056507037e-06, "loss": 0.7339, "step": 3998 }, { "epoch": 1.663910771390687, "grad_norm": 1.727984070777893, "learning_rate": 8.695478411322967e-06, "loss": 0.8513, "step": 3999 }, { "epoch": 1.664326755583288, "grad_norm": 2.3940258026123047, "learning_rate": 8.691036519434382e-06, "loss": 0.7722, "step": 4000 }, { "epoch": 1.664326755583288, "eval_loss": 0.7698240876197815, "eval_runtime": 1877.0401, "eval_samples_per_second": 3.511, "eval_steps_per_second": 1.756, "step": 4000 }, { "epoch": 1.6647427397758885, "grad_norm": 1.7489268779754639, "learning_rate": 8.686594890296234e-06, "loss": 0.8807, "step": 4001 }, { "epoch": 1.665158723968489, "grad_norm": 2.4991061687469482, "learning_rate": 8.682153524800106e-06, "loss": 0.8379, "step": 4002 }, { "epoch": 1.6655747081610899, "grad_norm": 1.8180549144744873, "learning_rate": 8.677712423837512e-06, "loss": 0.8068, "step": 4003 }, { "epoch": 1.6659906923536907, "grad_norm": 1.8703563213348389, "learning_rate": 8.673271588299931e-06, "loss": 0.8391, "step": 4004 }, { "epoch": 1.6664066765462913, "grad_norm": 1.704405426979065, "learning_rate": 8.668831019078776e-06, "loss": 0.7925, "step": 4005 }, { "epoch": 1.6668226607388918, "grad_norm": 2.3442323207855225, "learning_rate": 8.66439071706541e-06, "loss": 0.8342, "step": 4006 }, { "epoch": 1.6672386449314927, "grad_norm": 1.8648210763931274, "learning_rate": 8.659950683151135e-06, "loss": 0.9129, "step": 4007 }, { "epoch": 1.6676546291240932, "grad_norm": 1.770129680633545, "learning_rate": 8.65551091822722e-06, "loss": 0.8403, "step": 4008 }, { "epoch": 1.668070613316694, "grad_norm": 1.5918816328048706, "learning_rate": 8.651071423184855e-06, "loss": 0.7439, "step": 4009 }, { "epoch": 1.6684865975092946, "grad_norm": 1.7273918390274048, "learning_rate": 8.646632198915195e-06, "loss": 0.8211, "step": 4010 }, { "epoch": 1.6689025817018952, "grad_norm": 1.8395662307739258, "learning_rate": 8.642193246309328e-06, "loss": 0.8162, "step": 4011 }, { "epoch": 1.669318565894496, "grad_norm": 1.7123959064483643, "learning_rate": 8.637754566258291e-06, "loss": 0.7081, "step": 4012 }, { "epoch": 1.6697345500870968, "grad_norm": 1.9496411085128784, "learning_rate": 8.633316159653077e-06, "loss": 0.8388, "step": 4013 }, { "epoch": 1.6701505342796974, "grad_norm": 1.6094763278961182, "learning_rate": 8.628878027384607e-06, "loss": 0.6747, "step": 4014 }, { "epoch": 1.670566518472298, "grad_norm": 1.6745692491531372, "learning_rate": 8.624440170343758e-06, "loss": 0.7694, "step": 4015 }, { "epoch": 1.6709825026648988, "grad_norm": 1.7257155179977417, "learning_rate": 8.620002589421342e-06, "loss": 0.8052, "step": 4016 }, { "epoch": 1.6713984868574994, "grad_norm": 1.5870774984359741, "learning_rate": 8.615565285508132e-06, "loss": 0.786, "step": 4017 }, { "epoch": 1.6718144710501002, "grad_norm": 1.811335802078247, "learning_rate": 8.611128259494826e-06, "loss": 0.7639, "step": 4018 }, { "epoch": 1.6722304552427008, "grad_norm": 1.916115164756775, "learning_rate": 8.606691512272085e-06, "loss": 0.8218, "step": 4019 }, { "epoch": 1.6726464394353013, "grad_norm": 1.6387555599212646, "learning_rate": 8.602255044730498e-06, "loss": 0.8051, "step": 4020 }, { "epoch": 1.6730624236279021, "grad_norm": 3.1987318992614746, "learning_rate": 8.597818857760608e-06, "loss": 0.7408, "step": 4021 }, { "epoch": 1.673478407820503, "grad_norm": 2.1562623977661133, "learning_rate": 8.593382952252893e-06, "loss": 0.9069, "step": 4022 }, { "epoch": 1.6738943920131035, "grad_norm": 1.7358263731002808, "learning_rate": 8.588947329097788e-06, "loss": 0.8452, "step": 4023 }, { "epoch": 1.674310376205704, "grad_norm": 1.591935396194458, "learning_rate": 8.584511989185654e-06, "loss": 0.7215, "step": 4024 }, { "epoch": 1.674726360398305, "grad_norm": 1.6118773221969604, "learning_rate": 8.580076933406813e-06, "loss": 0.9016, "step": 4025 }, { "epoch": 1.6751423445909055, "grad_norm": 1.6140061616897583, "learning_rate": 8.575642162651517e-06, "loss": 0.7582, "step": 4026 }, { "epoch": 1.6755583287835063, "grad_norm": 1.6502459049224854, "learning_rate": 8.571207677809965e-06, "loss": 0.8389, "step": 4027 }, { "epoch": 1.6759743129761069, "grad_norm": 1.7432451248168945, "learning_rate": 8.566773479772296e-06, "loss": 0.8442, "step": 4028 }, { "epoch": 1.6763902971687075, "grad_norm": 1.7676010131835938, "learning_rate": 8.562339569428598e-06, "loss": 0.7966, "step": 4029 }, { "epoch": 1.6768062813613083, "grad_norm": 1.7822251319885254, "learning_rate": 8.557905947668898e-06, "loss": 0.7004, "step": 4030 }, { "epoch": 1.677222265553909, "grad_norm": 1.8043791055679321, "learning_rate": 8.553472615383159e-06, "loss": 0.8867, "step": 4031 }, { "epoch": 1.6776382497465097, "grad_norm": 1.8458670377731323, "learning_rate": 8.5490395734613e-06, "loss": 0.8373, "step": 4032 }, { "epoch": 1.6780542339391102, "grad_norm": 1.7130869626998901, "learning_rate": 8.54460682279316e-06, "loss": 0.7764, "step": 4033 }, { "epoch": 1.678470218131711, "grad_norm": 1.818289041519165, "learning_rate": 8.540174364268547e-06, "loss": 0.7169, "step": 4034 }, { "epoch": 1.6788862023243116, "grad_norm": 1.851591944694519, "learning_rate": 8.535742198777188e-06, "loss": 0.8999, "step": 4035 }, { "epoch": 1.6793021865169124, "grad_norm": 1.7230031490325928, "learning_rate": 8.53131032720876e-06, "loss": 0.7703, "step": 4036 }, { "epoch": 1.679718170709513, "grad_norm": 2.9559059143066406, "learning_rate": 8.526878750452876e-06, "loss": 0.8696, "step": 4037 }, { "epoch": 1.6801341549021136, "grad_norm": 1.8848764896392822, "learning_rate": 8.522447469399102e-06, "loss": 0.8728, "step": 4038 }, { "epoch": 1.6805501390947144, "grad_norm": 1.8139477968215942, "learning_rate": 8.518016484936929e-06, "loss": 0.8164, "step": 4039 }, { "epoch": 1.6809661232873152, "grad_norm": 1.6758215427398682, "learning_rate": 8.513585797955804e-06, "loss": 0.8017, "step": 4040 }, { "epoch": 1.6813821074799158, "grad_norm": 1.770059585571289, "learning_rate": 8.509155409345097e-06, "loss": 0.7936, "step": 4041 }, { "epoch": 1.6817980916725164, "grad_norm": 1.6133556365966797, "learning_rate": 8.504725319994136e-06, "loss": 0.6764, "step": 4042 }, { "epoch": 1.6822140758651172, "grad_norm": 1.6965467929840088, "learning_rate": 8.500295530792173e-06, "loss": 0.9369, "step": 4043 }, { "epoch": 1.6826300600577178, "grad_norm": 1.8010367155075073, "learning_rate": 8.495866042628412e-06, "loss": 0.9056, "step": 4044 }, { "epoch": 1.6830460442503186, "grad_norm": 1.6741318702697754, "learning_rate": 8.49143685639199e-06, "loss": 0.771, "step": 4045 }, { "epoch": 1.6834620284429191, "grad_norm": 1.7550945281982422, "learning_rate": 8.487007972971982e-06, "loss": 0.8585, "step": 4046 }, { "epoch": 1.6838780126355197, "grad_norm": 1.7695475816726685, "learning_rate": 8.482579393257412e-06, "loss": 0.8574, "step": 4047 }, { "epoch": 1.6842939968281205, "grad_norm": 1.7588311433792114, "learning_rate": 8.47815111813723e-06, "loss": 0.826, "step": 4048 }, { "epoch": 1.6847099810207213, "grad_norm": 1.7579982280731201, "learning_rate": 8.473723148500335e-06, "loss": 0.9172, "step": 4049 }, { "epoch": 1.685125965213322, "grad_norm": 1.5130703449249268, "learning_rate": 8.46929548523556e-06, "loss": 0.7449, "step": 4050 }, { "epoch": 1.6855419494059225, "grad_norm": 1.6304173469543457, "learning_rate": 8.464868129231676e-06, "loss": 0.8971, "step": 4051 }, { "epoch": 1.6859579335985233, "grad_norm": 1.7397769689559937, "learning_rate": 8.460441081377391e-06, "loss": 0.721, "step": 4052 }, { "epoch": 1.6863739177911239, "grad_norm": 1.7722827196121216, "learning_rate": 8.456014342561362e-06, "loss": 0.7994, "step": 4053 }, { "epoch": 1.6867899019837247, "grad_norm": 1.7245006561279297, "learning_rate": 8.451587913672168e-06, "loss": 0.8097, "step": 4054 }, { "epoch": 1.6872058861763253, "grad_norm": 1.6995552778244019, "learning_rate": 8.447161795598335e-06, "loss": 0.8658, "step": 4055 }, { "epoch": 1.6876218703689259, "grad_norm": 1.7845336198806763, "learning_rate": 8.442735989228328e-06, "loss": 0.8937, "step": 4056 }, { "epoch": 1.6880378545615267, "grad_norm": 1.8225178718566895, "learning_rate": 8.438310495450548e-06, "loss": 0.8527, "step": 4057 }, { "epoch": 1.6884538387541275, "grad_norm": 1.889350175857544, "learning_rate": 8.43388531515332e-06, "loss": 0.8658, "step": 4058 }, { "epoch": 1.688869822946728, "grad_norm": 1.74077570438385, "learning_rate": 8.429460449224933e-06, "loss": 0.754, "step": 4059 }, { "epoch": 1.6892858071393286, "grad_norm": 1.8899641036987305, "learning_rate": 8.425035898553591e-06, "loss": 0.7313, "step": 4060 }, { "epoch": 1.6897017913319294, "grad_norm": 9.697924613952637, "learning_rate": 8.420611664027439e-06, "loss": 0.8587, "step": 4061 }, { "epoch": 1.69011777552453, "grad_norm": 1.8089430332183838, "learning_rate": 8.416187746534569e-06, "loss": 0.8211, "step": 4062 }, { "epoch": 1.6905337597171308, "grad_norm": 1.7806117534637451, "learning_rate": 8.41176414696299e-06, "loss": 0.9052, "step": 4063 }, { "epoch": 1.6909497439097314, "grad_norm": 215.60061645507812, "learning_rate": 8.40734086620067e-06, "loss": 0.697, "step": 4064 }, { "epoch": 1.691365728102332, "grad_norm": 1.7360808849334717, "learning_rate": 8.402917905135494e-06, "loss": 0.7882, "step": 4065 }, { "epoch": 1.6917817122949328, "grad_norm": 1.7919551134109497, "learning_rate": 8.398495264655296e-06, "loss": 0.8234, "step": 4066 }, { "epoch": 1.6921976964875336, "grad_norm": 1.9791431427001953, "learning_rate": 8.394072945647831e-06, "loss": 0.8057, "step": 4067 }, { "epoch": 1.6926136806801342, "grad_norm": 1.9099915027618408, "learning_rate": 8.38965094900081e-06, "loss": 0.8707, "step": 4068 }, { "epoch": 1.6930296648727348, "grad_norm": 3.3283815383911133, "learning_rate": 8.385229275601861e-06, "loss": 0.866, "step": 4069 }, { "epoch": 1.6934456490653356, "grad_norm": 1.7634867429733276, "learning_rate": 8.380807926338555e-06, "loss": 0.7928, "step": 4070 }, { "epoch": 1.6938616332579362, "grad_norm": 1.7030550241470337, "learning_rate": 8.376386902098395e-06, "loss": 0.8731, "step": 4071 }, { "epoch": 1.694277617450537, "grad_norm": 1.8601270914077759, "learning_rate": 8.371966203768825e-06, "loss": 0.8081, "step": 4072 }, { "epoch": 1.6946936016431375, "grad_norm": 1.7311406135559082, "learning_rate": 8.367545832237213e-06, "loss": 0.8549, "step": 4073 }, { "epoch": 1.6951095858357381, "grad_norm": 1.6237000226974487, "learning_rate": 8.363125788390876e-06, "loss": 0.7895, "step": 4074 }, { "epoch": 1.695525570028339, "grad_norm": 1.6921099424362183, "learning_rate": 8.358706073117053e-06, "loss": 0.7278, "step": 4075 }, { "epoch": 1.6959415542209397, "grad_norm": 1.8178350925445557, "learning_rate": 8.354286687302915e-06, "loss": 0.89, "step": 4076 }, { "epoch": 1.6963575384135403, "grad_norm": 1.7708508968353271, "learning_rate": 8.349867631835583e-06, "loss": 0.8301, "step": 4077 }, { "epoch": 1.696773522606141, "grad_norm": 1.7407896518707275, "learning_rate": 8.345448907602092e-06, "loss": 0.7515, "step": 4078 }, { "epoch": 1.6971895067987417, "grad_norm": 1.849577784538269, "learning_rate": 8.34103051548943e-06, "loss": 0.7947, "step": 4079 }, { "epoch": 1.6976054909913423, "grad_norm": 1.7165846824645996, "learning_rate": 8.3366124563845e-06, "loss": 0.8119, "step": 4080 }, { "epoch": 1.698021475183943, "grad_norm": 1668.63232421875, "learning_rate": 8.332194731174151e-06, "loss": 0.866, "step": 4081 }, { "epoch": 1.6984374593765437, "grad_norm": 1.5754573345184326, "learning_rate": 8.327777340745153e-06, "loss": 0.803, "step": 4082 }, { "epoch": 1.6988534435691443, "grad_norm": 1.573378086090088, "learning_rate": 8.323360285984228e-06, "loss": 0.7892, "step": 4083 }, { "epoch": 1.699269427761745, "grad_norm": 1.8189525604248047, "learning_rate": 8.318943567778008e-06, "loss": 0.8464, "step": 4084 }, { "epoch": 1.6996854119543459, "grad_norm": 2.1089258193969727, "learning_rate": 8.314527187013078e-06, "loss": 0.7816, "step": 4085 }, { "epoch": 1.7001013961469464, "grad_norm": 1.5363818407058716, "learning_rate": 8.310111144575934e-06, "loss": 0.7826, "step": 4086 }, { "epoch": 1.700517380339547, "grad_norm": 1.9439325332641602, "learning_rate": 8.305695441353026e-06, "loss": 0.9926, "step": 4087 }, { "epoch": 1.7009333645321478, "grad_norm": 1.7259858846664429, "learning_rate": 8.30128007823072e-06, "loss": 0.8742, "step": 4088 }, { "epoch": 1.7013493487247484, "grad_norm": 1.713486671447754, "learning_rate": 8.29686505609532e-06, "loss": 0.7493, "step": 4089 }, { "epoch": 1.7017653329173492, "grad_norm": 1.6939167976379395, "learning_rate": 8.292450375833066e-06, "loss": 0.8223, "step": 4090 }, { "epoch": 1.7021813171099498, "grad_norm": 1.515265941619873, "learning_rate": 8.288036038330112e-06, "loss": 0.6774, "step": 4091 }, { "epoch": 1.7025973013025504, "grad_norm": 1.639531135559082, "learning_rate": 8.283622044472572e-06, "loss": 0.8197, "step": 4092 }, { "epoch": 1.7030132854951512, "grad_norm": 1.5852446556091309, "learning_rate": 8.27920839514646e-06, "loss": 0.7905, "step": 4093 }, { "epoch": 1.703429269687752, "grad_norm": 1.6388204097747803, "learning_rate": 8.274795091237744e-06, "loss": 0.8316, "step": 4094 }, { "epoch": 1.7038452538803526, "grad_norm": 3.2971527576446533, "learning_rate": 8.270382133632307e-06, "loss": 0.7827, "step": 4095 }, { "epoch": 1.7042612380729532, "grad_norm": 1.8117791414260864, "learning_rate": 8.265969523215977e-06, "loss": 0.8061, "step": 4096 }, { "epoch": 1.704677222265554, "grad_norm": 1.723633885383606, "learning_rate": 8.261557260874497e-06, "loss": 0.8872, "step": 4097 }, { "epoch": 1.7050932064581545, "grad_norm": 1.6483960151672363, "learning_rate": 8.257145347493556e-06, "loss": 0.776, "step": 4098 }, { "epoch": 1.7055091906507553, "grad_norm": 1.6981357336044312, "learning_rate": 8.25273378395876e-06, "loss": 0.8057, "step": 4099 }, { "epoch": 1.705925174843356, "grad_norm": 2.0019326210021973, "learning_rate": 8.248322571155648e-06, "loss": 0.8809, "step": 4100 }, { "epoch": 1.7063411590359565, "grad_norm": 1.830694317817688, "learning_rate": 8.243911709969692e-06, "loss": 0.763, "step": 4101 }, { "epoch": 1.7067571432285573, "grad_norm": 1.7507808208465576, "learning_rate": 8.239501201286295e-06, "loss": 0.7263, "step": 4102 }, { "epoch": 1.7071731274211581, "grad_norm": 1.737484335899353, "learning_rate": 8.235091045990778e-06, "loss": 0.9247, "step": 4103 }, { "epoch": 1.7075891116137587, "grad_norm": 1.8100577592849731, "learning_rate": 8.230681244968405e-06, "loss": 0.8988, "step": 4104 }, { "epoch": 1.7080050958063593, "grad_norm": 1.7543234825134277, "learning_rate": 8.226271799104364e-06, "loss": 0.855, "step": 4105 }, { "epoch": 1.70842107999896, "grad_norm": 1.789711594581604, "learning_rate": 8.221862709283765e-06, "loss": 0.9338, "step": 4106 }, { "epoch": 1.7088370641915607, "grad_norm": 1.7705649137496948, "learning_rate": 8.217453976391657e-06, "loss": 0.6937, "step": 4107 }, { "epoch": 1.7092530483841615, "grad_norm": 1.8833422660827637, "learning_rate": 8.21304560131301e-06, "loss": 0.9251, "step": 4108 }, { "epoch": 1.709669032576762, "grad_norm": 1.6917365789413452, "learning_rate": 8.208637584932724e-06, "loss": 0.7804, "step": 4109 }, { "epoch": 1.7100850167693626, "grad_norm": 1.804492473602295, "learning_rate": 8.204229928135626e-06, "loss": 0.8281, "step": 4110 }, { "epoch": 1.7105010009619634, "grad_norm": 1.7612367868423462, "learning_rate": 8.199822631806479e-06, "loss": 0.8416, "step": 4111 }, { "epoch": 1.7109169851545643, "grad_norm": 1.7098987102508545, "learning_rate": 8.195415696829957e-06, "loss": 0.8833, "step": 4112 }, { "epoch": 1.7113329693471648, "grad_norm": 1.7281477451324463, "learning_rate": 8.191009124090684e-06, "loss": 0.7947, "step": 4113 }, { "epoch": 1.7117489535397654, "grad_norm": 2.0321671962738037, "learning_rate": 8.186602914473188e-06, "loss": 0.976, "step": 4114 }, { "epoch": 1.7121649377323662, "grad_norm": 1.8308098316192627, "learning_rate": 8.182197068861942e-06, "loss": 0.8517, "step": 4115 }, { "epoch": 1.7125809219249668, "grad_norm": 1.767424464225769, "learning_rate": 8.17779158814133e-06, "loss": 0.8841, "step": 4116 }, { "epoch": 1.7129969061175676, "grad_norm": 1.7606009244918823, "learning_rate": 8.173386473195681e-06, "loss": 0.8423, "step": 4117 }, { "epoch": 1.7134128903101682, "grad_norm": 1.7396330833435059, "learning_rate": 8.168981724909233e-06, "loss": 0.8747, "step": 4118 }, { "epoch": 1.7138288745027688, "grad_norm": 1.6871978044509888, "learning_rate": 8.164577344166166e-06, "loss": 0.8012, "step": 4119 }, { "epoch": 1.7142448586953696, "grad_norm": 1.799464464187622, "learning_rate": 8.160173331850578e-06, "loss": 0.832, "step": 4120 }, { "epoch": 1.7146608428879704, "grad_norm": 1.6548398733139038, "learning_rate": 8.155769688846486e-06, "loss": 0.7548, "step": 4121 }, { "epoch": 1.715076827080571, "grad_norm": 1.7055673599243164, "learning_rate": 8.151366416037852e-06, "loss": 0.6941, "step": 4122 }, { "epoch": 1.7154928112731715, "grad_norm": 1.826789379119873, "learning_rate": 8.146963514308542e-06, "loss": 0.9507, "step": 4123 }, { "epoch": 1.7159087954657724, "grad_norm": 1.8808807134628296, "learning_rate": 8.142560984542364e-06, "loss": 0.9281, "step": 4124 }, { "epoch": 1.716324779658373, "grad_norm": 1.7178022861480713, "learning_rate": 8.138158827623041e-06, "loss": 0.7968, "step": 4125 }, { "epoch": 1.7167407638509737, "grad_norm": 1.7657543420791626, "learning_rate": 8.133757044434231e-06, "loss": 0.7071, "step": 4126 }, { "epoch": 1.7171567480435743, "grad_norm": 1.7344998121261597, "learning_rate": 8.129355635859504e-06, "loss": 0.7979, "step": 4127 }, { "epoch": 1.717572732236175, "grad_norm": 1.7678766250610352, "learning_rate": 8.124954602782371e-06, "loss": 0.8154, "step": 4128 }, { "epoch": 1.7179887164287757, "grad_norm": 1.7511032819747925, "learning_rate": 8.120553946086252e-06, "loss": 0.7854, "step": 4129 }, { "epoch": 1.7184047006213765, "grad_norm": 1.7171931266784668, "learning_rate": 8.116153666654501e-06, "loss": 0.8582, "step": 4130 }, { "epoch": 1.718820684813977, "grad_norm": 1.814544916152954, "learning_rate": 8.11175376537039e-06, "loss": 0.8716, "step": 4131 }, { "epoch": 1.7192366690065777, "grad_norm": 1.7820844650268555, "learning_rate": 8.107354243117125e-06, "loss": 0.9231, "step": 4132 }, { "epoch": 1.7196526531991785, "grad_norm": 1.9754719734191895, "learning_rate": 8.102955100777822e-06, "loss": 0.8072, "step": 4133 }, { "epoch": 1.720068637391779, "grad_norm": 1.705312967300415, "learning_rate": 8.098556339235532e-06, "loss": 0.9138, "step": 4134 }, { "epoch": 1.7204846215843799, "grad_norm": 1.692354440689087, "learning_rate": 8.094157959373232e-06, "loss": 0.8428, "step": 4135 }, { "epoch": 1.7209006057769805, "grad_norm": 1.7809308767318726, "learning_rate": 8.089759962073802e-06, "loss": 0.8074, "step": 4136 }, { "epoch": 1.721316589969581, "grad_norm": 1.7797455787658691, "learning_rate": 8.085362348220074e-06, "loss": 0.8137, "step": 4137 }, { "epoch": 1.7217325741621818, "grad_norm": 1.785284399986267, "learning_rate": 8.08096511869478e-06, "loss": 0.8028, "step": 4138 }, { "epoch": 1.7221485583547826, "grad_norm": 1.8367961645126343, "learning_rate": 8.076568274380584e-06, "loss": 0.8333, "step": 4139 }, { "epoch": 1.7225645425473832, "grad_norm": 1.7077080011367798, "learning_rate": 8.072171816160073e-06, "loss": 0.827, "step": 4140 }, { "epoch": 1.7229805267399838, "grad_norm": 1.8003333806991577, "learning_rate": 8.067775744915757e-06, "loss": 0.8367, "step": 4141 }, { "epoch": 1.7233965109325846, "grad_norm": 1.7398607730865479, "learning_rate": 8.063380061530062e-06, "loss": 0.9149, "step": 4142 }, { "epoch": 1.7238124951251852, "grad_norm": 1.8705781698226929, "learning_rate": 8.058984766885348e-06, "loss": 0.8144, "step": 4143 }, { "epoch": 1.724228479317786, "grad_norm": 1.6719529628753662, "learning_rate": 8.054589861863885e-06, "loss": 0.7799, "step": 4144 }, { "epoch": 1.7246444635103866, "grad_norm": 2.025888204574585, "learning_rate": 8.050195347347872e-06, "loss": 1.0283, "step": 4145 }, { "epoch": 1.7250604477029872, "grad_norm": 1.8547691106796265, "learning_rate": 8.045801224219423e-06, "loss": 0.8498, "step": 4146 }, { "epoch": 1.725476431895588, "grad_norm": 1.765761137008667, "learning_rate": 8.041407493360584e-06, "loss": 0.9245, "step": 4147 }, { "epoch": 1.7258924160881888, "grad_norm": 1.8606457710266113, "learning_rate": 8.037014155653312e-06, "loss": 1.0935, "step": 4148 }, { "epoch": 1.7263084002807894, "grad_norm": 1.8998550176620483, "learning_rate": 8.03262121197949e-06, "loss": 0.899, "step": 4149 }, { "epoch": 1.72672438447339, "grad_norm": 1.9052393436431885, "learning_rate": 8.028228663220918e-06, "loss": 0.7945, "step": 4150 }, { "epoch": 1.7271403686659907, "grad_norm": 1.6347508430480957, "learning_rate": 8.023836510259322e-06, "loss": 0.7127, "step": 4151 }, { "epoch": 1.7275563528585913, "grad_norm": 93.68359375, "learning_rate": 8.019444753976353e-06, "loss": 0.7841, "step": 4152 }, { "epoch": 1.7279723370511921, "grad_norm": 30.990177154541016, "learning_rate": 8.015053395253565e-06, "loss": 0.8325, "step": 4153 }, { "epoch": 1.7283883212437927, "grad_norm": 1.6862088441848755, "learning_rate": 8.010662434972449e-06, "loss": 0.7368, "step": 4154 }, { "epoch": 1.7288043054363933, "grad_norm": 1.9293245077133179, "learning_rate": 8.006271874014405e-06, "loss": 0.8095, "step": 4155 }, { "epoch": 1.729220289628994, "grad_norm": 1.8450968265533447, "learning_rate": 8.001881713260765e-06, "loss": 0.856, "step": 4156 }, { "epoch": 1.729636273821595, "grad_norm": 1.9082164764404297, "learning_rate": 7.997491953592765e-06, "loss": 0.8239, "step": 4157 }, { "epoch": 1.7300522580141955, "grad_norm": 1.7968590259552002, "learning_rate": 7.993102595891578e-06, "loss": 0.8795, "step": 4158 }, { "epoch": 1.730468242206796, "grad_norm": 1.7152966260910034, "learning_rate": 7.988713641038278e-06, "loss": 0.8021, "step": 4159 }, { "epoch": 1.7308842263993969, "grad_norm": 1.7127933502197266, "learning_rate": 7.984325089913875e-06, "loss": 0.8467, "step": 4160 }, { "epoch": 1.7313002105919975, "grad_norm": 1.7572319507598877, "learning_rate": 7.97993694339928e-06, "loss": 0.8141, "step": 4161 }, { "epoch": 1.7317161947845983, "grad_norm": 1.8504570722579956, "learning_rate": 7.975549202375347e-06, "loss": 0.8575, "step": 4162 }, { "epoch": 1.7321321789771988, "grad_norm": 1.6760259866714478, "learning_rate": 7.971161867722824e-06, "loss": 0.9089, "step": 4163 }, { "epoch": 1.7325481631697994, "grad_norm": 1.9349960088729858, "learning_rate": 7.966774940322393e-06, "loss": 0.9333, "step": 4164 }, { "epoch": 1.7329641473624002, "grad_norm": 1.743201732635498, "learning_rate": 7.962388421054646e-06, "loss": 0.8496, "step": 4165 }, { "epoch": 1.733380131555001, "grad_norm": 1.7057464122772217, "learning_rate": 7.958002310800095e-06, "loss": 0.8116, "step": 4166 }, { "epoch": 1.7337961157476016, "grad_norm": 1.754425287246704, "learning_rate": 7.95361661043918e-06, "loss": 0.7203, "step": 4167 }, { "epoch": 1.7342120999402022, "grad_norm": 1.7063441276550293, "learning_rate": 7.949231320852242e-06, "loss": 0.8003, "step": 4168 }, { "epoch": 1.734628084132803, "grad_norm": 1.7392882108688354, "learning_rate": 7.944846442919552e-06, "loss": 0.6907, "step": 4169 }, { "epoch": 1.7350440683254036, "grad_norm": 1.7431789636611938, "learning_rate": 7.940461977521288e-06, "loss": 0.7946, "step": 4170 }, { "epoch": 1.7354600525180044, "grad_norm": 1.725671648979187, "learning_rate": 7.936077925537559e-06, "loss": 0.8479, "step": 4171 }, { "epoch": 1.735876036710605, "grad_norm": 1.8029543161392212, "learning_rate": 7.931694287848376e-06, "loss": 0.7934, "step": 4172 }, { "epoch": 1.7362920209032056, "grad_norm": 1.7348307371139526, "learning_rate": 7.92731106533368e-06, "loss": 0.7937, "step": 4173 }, { "epoch": 1.7367080050958064, "grad_norm": 1.695665717124939, "learning_rate": 7.922928258873316e-06, "loss": 0.7903, "step": 4174 }, { "epoch": 1.7371239892884072, "grad_norm": 1.7917371988296509, "learning_rate": 7.91854586934706e-06, "loss": 0.8422, "step": 4175 }, { "epoch": 1.7375399734810077, "grad_norm": 1.6707991361618042, "learning_rate": 7.914163897634587e-06, "loss": 0.7667, "step": 4176 }, { "epoch": 1.7379559576736083, "grad_norm": 1.7566660642623901, "learning_rate": 7.909782344615508e-06, "loss": 0.8774, "step": 4177 }, { "epoch": 1.7383719418662091, "grad_norm": 1.815736174583435, "learning_rate": 7.905401211169332e-06, "loss": 0.8834, "step": 4178 }, { "epoch": 1.7387879260588097, "grad_norm": 1.7617627382278442, "learning_rate": 7.901020498175495e-06, "loss": 0.855, "step": 4179 }, { "epoch": 1.7392039102514105, "grad_norm": 1.7277528047561646, "learning_rate": 7.89664020651334e-06, "loss": 0.7877, "step": 4180 }, { "epoch": 1.739619894444011, "grad_norm": 2.0247347354888916, "learning_rate": 7.892260337062133e-06, "loss": 0.9946, "step": 4181 }, { "epoch": 1.7400358786366117, "grad_norm": 1.6109681129455566, "learning_rate": 7.887880890701058e-06, "loss": 0.6928, "step": 4182 }, { "epoch": 1.7404518628292125, "grad_norm": 1.5707571506500244, "learning_rate": 7.883501868309203e-06, "loss": 0.7978, "step": 4183 }, { "epoch": 1.7408678470218133, "grad_norm": 1.7387794256210327, "learning_rate": 7.879123270765578e-06, "loss": 0.8275, "step": 4184 }, { "epoch": 1.7412838312144139, "grad_norm": 1.7145860195159912, "learning_rate": 7.874745098949102e-06, "loss": 0.8302, "step": 4185 }, { "epoch": 1.7416998154070145, "grad_norm": 1.7769238948822021, "learning_rate": 7.870367353738621e-06, "loss": 0.8188, "step": 4186 }, { "epoch": 1.7421157995996153, "grad_norm": 1.7821736335754395, "learning_rate": 7.86599003601288e-06, "loss": 0.9161, "step": 4187 }, { "epoch": 1.7425317837922158, "grad_norm": 1.5278620719909668, "learning_rate": 7.861613146650551e-06, "loss": 0.8098, "step": 4188 }, { "epoch": 1.7429477679848167, "grad_norm": 161.04571533203125, "learning_rate": 7.857236686530206e-06, "loss": 0.8273, "step": 4189 }, { "epoch": 1.7433637521774172, "grad_norm": 1.913909912109375, "learning_rate": 7.85286065653035e-06, "loss": 1.0035, "step": 4190 }, { "epoch": 1.7437797363700178, "grad_norm": 1.8095390796661377, "learning_rate": 7.84848505752938e-06, "loss": 0.9685, "step": 4191 }, { "epoch": 1.7441957205626186, "grad_norm": 1.9502474069595337, "learning_rate": 7.844109890405628e-06, "loss": 0.8058, "step": 4192 }, { "epoch": 1.7446117047552194, "grad_norm": 1.8759719133377075, "learning_rate": 7.83973515603732e-06, "loss": 0.8853, "step": 4193 }, { "epoch": 1.74502768894782, "grad_norm": 1.7706632614135742, "learning_rate": 7.835360855302611e-06, "loss": 0.8426, "step": 4194 }, { "epoch": 1.7454436731404206, "grad_norm": 1.8306742906570435, "learning_rate": 7.830986989079554e-06, "loss": 0.8489, "step": 4195 }, { "epoch": 1.7458596573330214, "grad_norm": 1.7471855878829956, "learning_rate": 7.826613558246128e-06, "loss": 0.7932, "step": 4196 }, { "epoch": 1.746275641525622, "grad_norm": 30.916513442993164, "learning_rate": 7.822240563680217e-06, "loss": 0.8649, "step": 4197 }, { "epoch": 1.7466916257182228, "grad_norm": 2.0604710578918457, "learning_rate": 7.817868006259622e-06, "loss": 0.9118, "step": 4198 }, { "epoch": 1.7471076099108234, "grad_norm": 1.9412673711776733, "learning_rate": 7.813495886862053e-06, "loss": 0.9974, "step": 4199 }, { "epoch": 1.747523594103424, "grad_norm": 1.6954859495162964, "learning_rate": 7.809124206365126e-06, "loss": 0.759, "step": 4200 }, { "epoch": 1.7479395782960248, "grad_norm": 1.8282065391540527, "learning_rate": 7.804752965646388e-06, "loss": 0.8756, "step": 4201 }, { "epoch": 1.7483555624886256, "grad_norm": 6.318904399871826, "learning_rate": 7.800382165583277e-06, "loss": 0.8083, "step": 4202 }, { "epoch": 1.7487715466812261, "grad_norm": 1.6819814443588257, "learning_rate": 7.796011807053154e-06, "loss": 0.7958, "step": 4203 }, { "epoch": 1.7491875308738267, "grad_norm": 1.9598684310913086, "learning_rate": 7.791641890933285e-06, "loss": 0.9522, "step": 4204 }, { "epoch": 1.7496035150664275, "grad_norm": 1.551513910293579, "learning_rate": 7.787272418100856e-06, "loss": 0.7623, "step": 4205 }, { "epoch": 1.750019499259028, "grad_norm": 1.5726362466812134, "learning_rate": 7.782903389432953e-06, "loss": 0.7547, "step": 4206 }, { "epoch": 1.750435483451629, "grad_norm": 1.8177706003189087, "learning_rate": 7.778534805806586e-06, "loss": 0.8484, "step": 4207 }, { "epoch": 1.7508514676442295, "grad_norm": 1.7266416549682617, "learning_rate": 7.77416666809866e-06, "loss": 0.8353, "step": 4208 }, { "epoch": 1.75126745183683, "grad_norm": 1.7467657327651978, "learning_rate": 7.769798977186003e-06, "loss": 0.8395, "step": 4209 }, { "epoch": 1.7516834360294309, "grad_norm": 1.8194797039031982, "learning_rate": 7.765431733945346e-06, "loss": 0.8337, "step": 4210 }, { "epoch": 1.7520994202220317, "grad_norm": 2.6264700889587402, "learning_rate": 7.761064939253337e-06, "loss": 0.7929, "step": 4211 }, { "epoch": 1.7525154044146323, "grad_norm": 1.6639246940612793, "learning_rate": 7.756698593986531e-06, "loss": 0.7503, "step": 4212 }, { "epoch": 1.7529313886072329, "grad_norm": 2.8604183197021484, "learning_rate": 7.752332699021383e-06, "loss": 0.8341, "step": 4213 }, { "epoch": 1.7533473727998337, "grad_norm": 1.7923029661178589, "learning_rate": 7.747967255234277e-06, "loss": 0.8647, "step": 4214 }, { "epoch": 1.7537633569924342, "grad_norm": 1.6214078664779663, "learning_rate": 7.743602263501488e-06, "loss": 0.8082, "step": 4215 }, { "epoch": 1.754179341185035, "grad_norm": 2.0136353969573975, "learning_rate": 7.739237724699214e-06, "loss": 0.91, "step": 4216 }, { "epoch": 1.7545953253776356, "grad_norm": 1.7837265729904175, "learning_rate": 7.734873639703553e-06, "loss": 0.8731, "step": 4217 }, { "epoch": 1.7550113095702362, "grad_norm": 1.856164574623108, "learning_rate": 7.730510009390518e-06, "loss": 0.8785, "step": 4218 }, { "epoch": 1.755427293762837, "grad_norm": 1.73092520236969, "learning_rate": 7.726146834636022e-06, "loss": 0.9012, "step": 4219 }, { "epoch": 1.7558432779554378, "grad_norm": 36.267372131347656, "learning_rate": 7.721784116315898e-06, "loss": 0.6862, "step": 4220 }, { "epoch": 1.7562592621480384, "grad_norm": 1.7811485528945923, "learning_rate": 7.717421855305878e-06, "loss": 0.8643, "step": 4221 }, { "epoch": 1.756675246340639, "grad_norm": 1.9545940160751343, "learning_rate": 7.71306005248161e-06, "loss": 0.8727, "step": 4222 }, { "epoch": 1.7570912305332398, "grad_norm": 1.7347687482833862, "learning_rate": 7.708698708718643e-06, "loss": 0.8155, "step": 4223 }, { "epoch": 1.7575072147258404, "grad_norm": 1.787131905555725, "learning_rate": 7.704337824892439e-06, "loss": 0.756, "step": 4224 }, { "epoch": 1.7579231989184412, "grad_norm": 1.7308499813079834, "learning_rate": 7.699977401878361e-06, "loss": 0.7687, "step": 4225 }, { "epoch": 1.7583391831110418, "grad_norm": 1.8629049062728882, "learning_rate": 7.695617440551688e-06, "loss": 0.8935, "step": 4226 }, { "epoch": 1.7587551673036423, "grad_norm": 1.7543327808380127, "learning_rate": 7.691257941787605e-06, "loss": 0.8526, "step": 4227 }, { "epoch": 1.7591711514962431, "grad_norm": 1.7325416803359985, "learning_rate": 7.686898906461194e-06, "loss": 0.9158, "step": 4228 }, { "epoch": 1.759587135688844, "grad_norm": 1.8178397417068481, "learning_rate": 7.682540335447459e-06, "loss": 0.9405, "step": 4229 }, { "epoch": 1.7600031198814445, "grad_norm": 1.7299578189849854, "learning_rate": 7.678182229621297e-06, "loss": 0.86, "step": 4230 }, { "epoch": 1.7604191040740451, "grad_norm": 1.769973635673523, "learning_rate": 7.673824589857524e-06, "loss": 0.8563, "step": 4231 }, { "epoch": 1.760835088266646, "grad_norm": 1.7617281675338745, "learning_rate": 7.669467417030852e-06, "loss": 0.7757, "step": 4232 }, { "epoch": 1.7612510724592465, "grad_norm": 1.7173004150390625, "learning_rate": 7.665110712015907e-06, "loss": 0.7313, "step": 4233 }, { "epoch": 1.7616670566518473, "grad_norm": 1.9078433513641357, "learning_rate": 7.66075447568721e-06, "loss": 0.8685, "step": 4234 }, { "epoch": 1.762083040844448, "grad_norm": 1.6469589471817017, "learning_rate": 7.656398708919205e-06, "loss": 0.8169, "step": 4235 }, { "epoch": 1.7624990250370485, "grad_norm": 1.7079402208328247, "learning_rate": 7.652043412586228e-06, "loss": 0.8496, "step": 4236 }, { "epoch": 1.7629150092296493, "grad_norm": 1.6402496099472046, "learning_rate": 7.647688587562527e-06, "loss": 0.8341, "step": 4237 }, { "epoch": 1.76333099342225, "grad_norm": 1.7456082105636597, "learning_rate": 7.64333423472225e-06, "loss": 0.7875, "step": 4238 }, { "epoch": 1.7637469776148507, "grad_norm": 1.863324761390686, "learning_rate": 7.638980354939458e-06, "loss": 0.8425, "step": 4239 }, { "epoch": 1.7641629618074512, "grad_norm": 8.207806587219238, "learning_rate": 7.634626949088106e-06, "loss": 0.854, "step": 4240 }, { "epoch": 1.764578946000052, "grad_norm": 1.7684991359710693, "learning_rate": 7.630274018042067e-06, "loss": 0.8395, "step": 4241 }, { "epoch": 1.7649949301926526, "grad_norm": 1.8653401136398315, "learning_rate": 7.62592156267511e-06, "loss": 0.8378, "step": 4242 }, { "epoch": 1.7654109143852534, "grad_norm": 1.7656526565551758, "learning_rate": 7.6215695838609085e-06, "loss": 0.867, "step": 4243 }, { "epoch": 1.765826898577854, "grad_norm": 1.6997495889663696, "learning_rate": 7.617218082473048e-06, "loss": 0.8756, "step": 4244 }, { "epoch": 1.7662428827704546, "grad_norm": 1.8838231563568115, "learning_rate": 7.612867059385004e-06, "loss": 0.9082, "step": 4245 }, { "epoch": 1.7666588669630554, "grad_norm": 1.9222170114517212, "learning_rate": 7.608516515470174e-06, "loss": 0.8901, "step": 4246 }, { "epoch": 1.7670748511556562, "grad_norm": 1.8487285375595093, "learning_rate": 7.604166451601845e-06, "loss": 0.7844, "step": 4247 }, { "epoch": 1.7674908353482568, "grad_norm": 54.06987762451172, "learning_rate": 7.599816868653216e-06, "loss": 0.8484, "step": 4248 }, { "epoch": 1.7679068195408574, "grad_norm": 8.976716995239258, "learning_rate": 7.595467767497379e-06, "loss": 0.8516, "step": 4249 }, { "epoch": 1.7683228037334582, "grad_norm": 1.6064163446426392, "learning_rate": 7.5911191490073446e-06, "loss": 0.8141, "step": 4250 }, { "epoch": 1.7687387879260588, "grad_norm": 1.772729516029358, "learning_rate": 7.586771014056013e-06, "loss": 0.8028, "step": 4251 }, { "epoch": 1.7691547721186596, "grad_norm": 1.8071147203445435, "learning_rate": 7.582423363516196e-06, "loss": 0.9135, "step": 4252 }, { "epoch": 1.7695707563112602, "grad_norm": 1.8968968391418457, "learning_rate": 7.578076198260598e-06, "loss": 0.8586, "step": 4253 }, { "epoch": 1.7699867405038607, "grad_norm": 1.7787644863128662, "learning_rate": 7.5737295191618435e-06, "loss": 0.8543, "step": 4254 }, { "epoch": 1.7704027246964615, "grad_norm": 1.756638765335083, "learning_rate": 7.5693833270924385e-06, "loss": 0.8415, "step": 4255 }, { "epoch": 1.7708187088890623, "grad_norm": 1.629860520362854, "learning_rate": 7.565037622924808e-06, "loss": 0.8021, "step": 4256 }, { "epoch": 1.771234693081663, "grad_norm": 50.16199493408203, "learning_rate": 7.560692407531273e-06, "loss": 0.8771, "step": 4257 }, { "epoch": 1.7716506772742635, "grad_norm": 1.80641508102417, "learning_rate": 7.556347681784049e-06, "loss": 0.7083, "step": 4258 }, { "epoch": 1.7720666614668643, "grad_norm": 82.59822845458984, "learning_rate": 7.552003446555268e-06, "loss": 0.8043, "step": 4259 }, { "epoch": 1.772482645659465, "grad_norm": 1.9409046173095703, "learning_rate": 7.547659702716949e-06, "loss": 0.7761, "step": 4260 }, { "epoch": 1.7728986298520657, "grad_norm": 1.733394742012024, "learning_rate": 7.5433164511410275e-06, "loss": 0.7189, "step": 4261 }, { "epoch": 1.7733146140446663, "grad_norm": 1.7656986713409424, "learning_rate": 7.538973692699323e-06, "loss": 0.8869, "step": 4262 }, { "epoch": 1.7737305982372669, "grad_norm": 1.6757888793945312, "learning_rate": 7.53463142826357e-06, "loss": 0.6377, "step": 4263 }, { "epoch": 1.7741465824298677, "grad_norm": 1.7857306003570557, "learning_rate": 7.530289658705392e-06, "loss": 0.8579, "step": 4264 }, { "epoch": 1.7745625666224685, "grad_norm": 1.7319986820220947, "learning_rate": 7.52594838489633e-06, "loss": 0.9164, "step": 4265 }, { "epoch": 1.774978550815069, "grad_norm": 1.8854272365570068, "learning_rate": 7.521607607707806e-06, "loss": 0.8327, "step": 4266 }, { "epoch": 1.7753945350076696, "grad_norm": 1.767299771308899, "learning_rate": 7.517267328011159e-06, "loss": 0.8589, "step": 4267 }, { "epoch": 1.7758105192002704, "grad_norm": 1.865480899810791, "learning_rate": 7.5129275466776105e-06, "loss": 0.9568, "step": 4268 }, { "epoch": 1.776226503392871, "grad_norm": 1.877968668937683, "learning_rate": 7.5085882645783035e-06, "loss": 0.819, "step": 4269 }, { "epoch": 1.7766424875854718, "grad_norm": 1.6534456014633179, "learning_rate": 7.50424948258426e-06, "loss": 0.7437, "step": 4270 }, { "epoch": 1.7770584717780724, "grad_norm": 3.9997379779815674, "learning_rate": 7.499911201566419e-06, "loss": 0.8947, "step": 4271 }, { "epoch": 1.777474455970673, "grad_norm": 1.8715204000473022, "learning_rate": 7.495573422395605e-06, "loss": 0.8851, "step": 4272 }, { "epoch": 1.7778904401632738, "grad_norm": 1.7702816724777222, "learning_rate": 7.491236145942548e-06, "loss": 0.8724, "step": 4273 }, { "epoch": 1.7783064243558746, "grad_norm": 1.703887701034546, "learning_rate": 7.486899373077884e-06, "loss": 0.8515, "step": 4274 }, { "epoch": 1.7787224085484752, "grad_norm": 1.7142771482467651, "learning_rate": 7.482563104672133e-06, "loss": 0.7718, "step": 4275 }, { "epoch": 1.7791383927410758, "grad_norm": 1.7670600414276123, "learning_rate": 7.478227341595723e-06, "loss": 0.6872, "step": 4276 }, { "epoch": 1.7795543769336766, "grad_norm": 1.7377057075500488, "learning_rate": 7.473892084718979e-06, "loss": 0.8239, "step": 4277 }, { "epoch": 1.7799703611262772, "grad_norm": 1.7210698127746582, "learning_rate": 7.469557334912128e-06, "loss": 0.7922, "step": 4278 }, { "epoch": 1.780386345318878, "grad_norm": 1.9174107313156128, "learning_rate": 7.4652230930452865e-06, "loss": 0.8211, "step": 4279 }, { "epoch": 1.7808023295114785, "grad_norm": 2.372054100036621, "learning_rate": 7.460889359988478e-06, "loss": 0.8477, "step": 4280 }, { "epoch": 1.7812183137040791, "grad_norm": 1.8233016729354858, "learning_rate": 7.456556136611618e-06, "loss": 0.8926, "step": 4281 }, { "epoch": 1.78163429789668, "grad_norm": 1.7352811098098755, "learning_rate": 7.452223423784524e-06, "loss": 0.8237, "step": 4282 }, { "epoch": 1.7820502820892807, "grad_norm": 2.002655029296875, "learning_rate": 7.447891222376902e-06, "loss": 0.8358, "step": 4283 }, { "epoch": 1.7824662662818813, "grad_norm": 20.94190216064453, "learning_rate": 7.443559533258373e-06, "loss": 0.906, "step": 4284 }, { "epoch": 1.782882250474482, "grad_norm": 1.7819209098815918, "learning_rate": 7.439228357298433e-06, "loss": 0.9475, "step": 4285 }, { "epoch": 1.7832982346670827, "grad_norm": 1877.8472900390625, "learning_rate": 7.4348976953664954e-06, "loss": 0.9155, "step": 4286 }, { "epoch": 1.7837142188596833, "grad_norm": 1.7582813501358032, "learning_rate": 7.430567548331855e-06, "loss": 0.9294, "step": 4287 }, { "epoch": 1.784130203052284, "grad_norm": 1.7937216758728027, "learning_rate": 7.426237917063711e-06, "loss": 0.8606, "step": 4288 }, { "epoch": 1.7845461872448847, "grad_norm": 1.8801378011703491, "learning_rate": 7.421908802431162e-06, "loss": 0.8221, "step": 4289 }, { "epoch": 1.7849621714374853, "grad_norm": 1.797755241394043, "learning_rate": 7.417580205303193e-06, "loss": 0.8525, "step": 4290 }, { "epoch": 1.785378155630086, "grad_norm": 15.79560661315918, "learning_rate": 7.413252126548695e-06, "loss": 0.7782, "step": 4291 }, { "epoch": 1.7857941398226869, "grad_norm": 1.5789333581924438, "learning_rate": 7.408924567036444e-06, "loss": 0.8435, "step": 4292 }, { "epoch": 1.7862101240152874, "grad_norm": 1.6696120500564575, "learning_rate": 7.404597527635124e-06, "loss": 0.8263, "step": 4293 }, { "epoch": 1.786626108207888, "grad_norm": 1.758567214012146, "learning_rate": 7.400271009213307e-06, "loss": 0.8018, "step": 4294 }, { "epoch": 1.7870420924004888, "grad_norm": 1.669158697128296, "learning_rate": 7.395945012639464e-06, "loss": 0.7987, "step": 4295 }, { "epoch": 1.7874580765930894, "grad_norm": 1.9988754987716675, "learning_rate": 7.391619538781957e-06, "loss": 0.9091, "step": 4296 }, { "epoch": 1.7878740607856902, "grad_norm": 1.6898517608642578, "learning_rate": 7.387294588509049e-06, "loss": 0.8419, "step": 4297 }, { "epoch": 1.7882900449782908, "grad_norm": 1.7845275402069092, "learning_rate": 7.382970162688887e-06, "loss": 0.8108, "step": 4298 }, { "epoch": 1.7887060291708914, "grad_norm": 1.8007062673568726, "learning_rate": 7.378646262189531e-06, "loss": 0.8136, "step": 4299 }, { "epoch": 1.7891220133634922, "grad_norm": 1.6189810037612915, "learning_rate": 7.374322887878913e-06, "loss": 0.8229, "step": 4300 }, { "epoch": 1.789537997556093, "grad_norm": 1.6826568841934204, "learning_rate": 7.370000040624884e-06, "loss": 0.7786, "step": 4301 }, { "epoch": 1.7899539817486936, "grad_norm": 1.6191834211349487, "learning_rate": 7.365677721295168e-06, "loss": 0.7208, "step": 4302 }, { "epoch": 1.7903699659412942, "grad_norm": 1.628928542137146, "learning_rate": 7.36135593075739e-06, "loss": 0.7861, "step": 4303 }, { "epoch": 1.790785950133895, "grad_norm": 1.8143408298492432, "learning_rate": 7.357034669879077e-06, "loss": 0.8725, "step": 4304 }, { "epoch": 1.7912019343264955, "grad_norm": 1.6118227243423462, "learning_rate": 7.35271393952764e-06, "loss": 0.8221, "step": 4305 }, { "epoch": 1.7916179185190964, "grad_norm": 1.8944424390792847, "learning_rate": 7.348393740570387e-06, "loss": 0.7534, "step": 4306 }, { "epoch": 1.792033902711697, "grad_norm": 1.84224271774292, "learning_rate": 7.344074073874514e-06, "loss": 0.7881, "step": 4307 }, { "epoch": 1.7924498869042975, "grad_norm": 1.7207305431365967, "learning_rate": 7.339754940307123e-06, "loss": 0.7811, "step": 4308 }, { "epoch": 1.7928658710968983, "grad_norm": 1.819828748703003, "learning_rate": 7.335436340735193e-06, "loss": 0.888, "step": 4309 }, { "epoch": 1.7932818552894991, "grad_norm": 1.726604700088501, "learning_rate": 7.331118276025611e-06, "loss": 0.782, "step": 4310 }, { "epoch": 1.7936978394820997, "grad_norm": 1.8304930925369263, "learning_rate": 7.3268007470451445e-06, "loss": 0.7675, "step": 4311 }, { "epoch": 1.7941138236747003, "grad_norm": 1.8414466381072998, "learning_rate": 7.322483754660461e-06, "loss": 0.8376, "step": 4312 }, { "epoch": 1.794529807867301, "grad_norm": 1.5683023929595947, "learning_rate": 7.3181672997381136e-06, "loss": 0.6962, "step": 4313 }, { "epoch": 1.7949457920599017, "grad_norm": 307.319580078125, "learning_rate": 7.313851383144558e-06, "loss": 0.7712, "step": 4314 }, { "epoch": 1.7953617762525025, "grad_norm": 1.7061030864715576, "learning_rate": 7.309536005746128e-06, "loss": 0.7304, "step": 4315 }, { "epoch": 1.795777760445103, "grad_norm": 1.8391321897506714, "learning_rate": 7.305221168409063e-06, "loss": 0.9199, "step": 4316 }, { "epoch": 1.7961937446377036, "grad_norm": 1.7419705390930176, "learning_rate": 7.300906871999484e-06, "loss": 0.8404, "step": 4317 }, { "epoch": 1.7966097288303045, "grad_norm": 1.5752440690994263, "learning_rate": 7.296593117383406e-06, "loss": 0.8145, "step": 4318 }, { "epoch": 1.7970257130229053, "grad_norm": 1.963855504989624, "learning_rate": 7.292279905426743e-06, "loss": 0.9275, "step": 4319 }, { "epoch": 1.7974416972155058, "grad_norm": 1.7620195150375366, "learning_rate": 7.287967236995286e-06, "loss": 0.8222, "step": 4320 }, { "epoch": 1.7978576814081064, "grad_norm": 1.7480653524398804, "learning_rate": 7.283655112954727e-06, "loss": 0.8767, "step": 4321 }, { "epoch": 1.7982736656007072, "grad_norm": 1.8674240112304688, "learning_rate": 7.279343534170642e-06, "loss": 0.8684, "step": 4322 }, { "epoch": 1.7986896497933078, "grad_norm": 2.488511800765991, "learning_rate": 7.27503250150851e-06, "loss": 0.885, "step": 4323 }, { "epoch": 1.7991056339859086, "grad_norm": 1.670782446861267, "learning_rate": 7.2707220158336825e-06, "loss": 0.7063, "step": 4324 }, { "epoch": 1.7995216181785092, "grad_norm": 2.301187515258789, "learning_rate": 7.266412078011419e-06, "loss": 0.9076, "step": 4325 }, { "epoch": 1.7999376023711098, "grad_norm": 5.987264156341553, "learning_rate": 7.262102688906854e-06, "loss": 0.8594, "step": 4326 }, { "epoch": 1.8003535865637106, "grad_norm": 1.9549881219863892, "learning_rate": 7.257793849385024e-06, "loss": 0.9075, "step": 4327 }, { "epoch": 1.8007695707563114, "grad_norm": 1.7651453018188477, "learning_rate": 7.25348556031084e-06, "loss": 0.762, "step": 4328 }, { "epoch": 1.801185554948912, "grad_norm": 1.7896603345870972, "learning_rate": 7.249177822549125e-06, "loss": 0.849, "step": 4329 }, { "epoch": 1.8016015391415126, "grad_norm": 1.5888559818267822, "learning_rate": 7.2448706369645694e-06, "loss": 0.8444, "step": 4330 }, { "epoch": 1.8020175233341134, "grad_norm": 1.6749192476272583, "learning_rate": 7.2405640044217666e-06, "loss": 0.7354, "step": 4331 }, { "epoch": 1.802433507526714, "grad_norm": 1.8955645561218262, "learning_rate": 7.236257925785186e-06, "loss": 0.9428, "step": 4332 }, { "epoch": 1.8028494917193147, "grad_norm": 1.9221868515014648, "learning_rate": 7.231952401919202e-06, "loss": 0.7249, "step": 4333 }, { "epoch": 1.8032654759119153, "grad_norm": 7.013227462768555, "learning_rate": 7.227647433688071e-06, "loss": 0.7824, "step": 4334 }, { "epoch": 1.803681460104516, "grad_norm": 1.8921656608581543, "learning_rate": 7.223343021955931e-06, "loss": 0.7263, "step": 4335 }, { "epoch": 1.8040974442971167, "grad_norm": 1.9938517808914185, "learning_rate": 7.219039167586819e-06, "loss": 0.8601, "step": 4336 }, { "epoch": 1.8045134284897175, "grad_norm": 1.834831714630127, "learning_rate": 7.214735871444647e-06, "loss": 0.826, "step": 4337 }, { "epoch": 1.804929412682318, "grad_norm": 1.577713131904602, "learning_rate": 7.210433134393232e-06, "loss": 0.7691, "step": 4338 }, { "epoch": 1.8053453968749187, "grad_norm": 1.9127203226089478, "learning_rate": 7.2061309572962615e-06, "loss": 0.7903, "step": 4339 }, { "epoch": 1.8057613810675195, "grad_norm": 1.7546720504760742, "learning_rate": 7.201829341017326e-06, "loss": 0.7927, "step": 4340 }, { "epoch": 1.80617736526012, "grad_norm": 1.954454779624939, "learning_rate": 7.1975282864198925e-06, "loss": 0.8326, "step": 4341 }, { "epoch": 1.8065933494527209, "grad_norm": 261.6630554199219, "learning_rate": 7.193227794367319e-06, "loss": 0.7304, "step": 4342 }, { "epoch": 1.8070093336453215, "grad_norm": 1.9545284509658813, "learning_rate": 7.188927865722848e-06, "loss": 0.836, "step": 4343 }, { "epoch": 1.807425317837922, "grad_norm": 1.933269739151001, "learning_rate": 7.1846285013496185e-06, "loss": 0.803, "step": 4344 }, { "epoch": 1.8078413020305228, "grad_norm": 1.6528619527816772, "learning_rate": 7.180329702110642e-06, "loss": 0.7924, "step": 4345 }, { "epoch": 1.8082572862231236, "grad_norm": 1.8467875719070435, "learning_rate": 7.1760314688688294e-06, "loss": 0.7506, "step": 4346 }, { "epoch": 1.8086732704157242, "grad_norm": 1.7400349378585815, "learning_rate": 7.171733802486964e-06, "loss": 0.8212, "step": 4347 }, { "epoch": 1.8090892546083248, "grad_norm": 1.9251269102096558, "learning_rate": 7.167436703827731e-06, "loss": 0.8503, "step": 4348 }, { "epoch": 1.8095052388009256, "grad_norm": 2.112067461013794, "learning_rate": 7.163140173753695e-06, "loss": 0.7906, "step": 4349 }, { "epoch": 1.8099212229935262, "grad_norm": 1.7693017721176147, "learning_rate": 7.158844213127301e-06, "loss": 0.7699, "step": 4350 }, { "epoch": 1.810337207186127, "grad_norm": 1.6948834657669067, "learning_rate": 7.1545488228108885e-06, "loss": 0.7942, "step": 4351 }, { "epoch": 1.8107531913787276, "grad_norm": 1.7469571828842163, "learning_rate": 7.150254003666673e-06, "loss": 0.8803, "step": 4352 }, { "epoch": 1.8111691755713282, "grad_norm": 10.506972312927246, "learning_rate": 7.145959756556767e-06, "loss": 0.9383, "step": 4353 }, { "epoch": 1.811585159763929, "grad_norm": 1.8420698642730713, "learning_rate": 7.141666082343157e-06, "loss": 0.7755, "step": 4354 }, { "epoch": 1.8120011439565298, "grad_norm": 1.830531358718872, "learning_rate": 7.13737298188772e-06, "loss": 0.8399, "step": 4355 }, { "epoch": 1.8124171281491304, "grad_norm": 1.7766133546829224, "learning_rate": 7.133080456052222e-06, "loss": 0.8103, "step": 4356 }, { "epoch": 1.812833112341731, "grad_norm": 1.836411476135254, "learning_rate": 7.128788505698305e-06, "loss": 0.9216, "step": 4357 }, { "epoch": 1.8132490965343318, "grad_norm": 1.6438714265823364, "learning_rate": 7.124497131687498e-06, "loss": 0.7446, "step": 4358 }, { "epoch": 1.8136650807269323, "grad_norm": 5.0827531814575195, "learning_rate": 7.12020633488122e-06, "loss": 0.8488, "step": 4359 }, { "epoch": 1.8140810649195331, "grad_norm": 1.7818903923034668, "learning_rate": 7.115916116140766e-06, "loss": 0.9108, "step": 4360 }, { "epoch": 1.8144970491121337, "grad_norm": 1.7906599044799805, "learning_rate": 7.111626476327324e-06, "loss": 0.8502, "step": 4361 }, { "epoch": 1.8149130333047343, "grad_norm": 1.9271456003189087, "learning_rate": 7.107337416301952e-06, "loss": 0.8384, "step": 4362 }, { "epoch": 1.815329017497335, "grad_norm": 1.8468081951141357, "learning_rate": 7.103048936925605e-06, "loss": 0.8913, "step": 4363 }, { "epoch": 1.815745001689936, "grad_norm": 1.7585303783416748, "learning_rate": 7.098761039059121e-06, "loss": 0.8312, "step": 4364 }, { "epoch": 1.8161609858825365, "grad_norm": 1.7728523015975952, "learning_rate": 7.094473723563211e-06, "loss": 0.7345, "step": 4365 }, { "epoch": 1.816576970075137, "grad_norm": 1.713564395904541, "learning_rate": 7.09018699129848e-06, "loss": 0.8207, "step": 4366 }, { "epoch": 1.8169929542677379, "grad_norm": 1.8366279602050781, "learning_rate": 7.0859008431254015e-06, "loss": 0.7745, "step": 4367 }, { "epoch": 1.8174089384603385, "grad_norm": 1.6977037191390991, "learning_rate": 7.081615279904354e-06, "loss": 0.7386, "step": 4368 }, { "epoch": 1.8178249226529393, "grad_norm": 1.6723259687423706, "learning_rate": 7.077330302495578e-06, "loss": 0.763, "step": 4369 }, { "epoch": 1.8182409068455399, "grad_norm": 1.6542778015136719, "learning_rate": 7.073045911759205e-06, "loss": 0.9405, "step": 4370 }, { "epoch": 1.8186568910381404, "grad_norm": 1.8930925130844116, "learning_rate": 7.068762108555247e-06, "loss": 0.8526, "step": 4371 }, { "epoch": 1.8190728752307412, "grad_norm": 1.8025898933410645, "learning_rate": 7.064478893743604e-06, "loss": 0.8479, "step": 4372 }, { "epoch": 1.819488859423342, "grad_norm": 1.7735077142715454, "learning_rate": 7.060196268184045e-06, "loss": 0.8195, "step": 4373 }, { "epoch": 1.8199048436159426, "grad_norm": 1.8342616558074951, "learning_rate": 7.055914232736238e-06, "loss": 0.7395, "step": 4374 }, { "epoch": 1.8203208278085432, "grad_norm": 1.777408480644226, "learning_rate": 7.051632788259716e-06, "loss": 0.738, "step": 4375 }, { "epoch": 1.820736812001144, "grad_norm": 1.6977208852767944, "learning_rate": 7.047351935613905e-06, "loss": 0.8197, "step": 4376 }, { "epoch": 1.8211527961937446, "grad_norm": 1.8202592134475708, "learning_rate": 7.043071675658102e-06, "loss": 0.7885, "step": 4377 }, { "epoch": 1.8215687803863454, "grad_norm": 24.425546646118164, "learning_rate": 7.038792009251494e-06, "loss": 0.7589, "step": 4378 }, { "epoch": 1.821984764578946, "grad_norm": 1.7861244678497314, "learning_rate": 7.034512937253151e-06, "loss": 0.83, "step": 4379 }, { "epoch": 1.8224007487715466, "grad_norm": 1.7628099918365479, "learning_rate": 7.0302344605220104e-06, "loss": 0.7939, "step": 4380 }, { "epoch": 1.8228167329641474, "grad_norm": 1.7876603603363037, "learning_rate": 7.025956579916904e-06, "loss": 0.7681, "step": 4381 }, { "epoch": 1.8232327171567482, "grad_norm": 1.776249647140503, "learning_rate": 7.0216792962965305e-06, "loss": 0.734, "step": 4382 }, { "epoch": 1.8236487013493488, "grad_norm": 1.9032411575317383, "learning_rate": 7.017402610519486e-06, "loss": 1.0232, "step": 4383 }, { "epoch": 1.8240646855419493, "grad_norm": 1.7259764671325684, "learning_rate": 7.013126523444227e-06, "loss": 0.8623, "step": 4384 }, { "epoch": 1.8244806697345501, "grad_norm": 926.1299438476562, "learning_rate": 7.008851035929108e-06, "loss": 0.8497, "step": 4385 }, { "epoch": 1.8248966539271507, "grad_norm": 1.7099920511245728, "learning_rate": 7.004576148832347e-06, "loss": 0.7517, "step": 4386 }, { "epoch": 1.8253126381197515, "grad_norm": 1.779008388519287, "learning_rate": 7.000301863012058e-06, "loss": 0.8509, "step": 4387 }, { "epoch": 1.8257286223123521, "grad_norm": 2.446597099304199, "learning_rate": 6.9960281793262155e-06, "loss": 0.8853, "step": 4388 }, { "epoch": 1.8261446065049527, "grad_norm": 1.7804690599441528, "learning_rate": 6.991755098632693e-06, "loss": 0.7586, "step": 4389 }, { "epoch": 1.8265605906975535, "grad_norm": 123.91058349609375, "learning_rate": 6.987482621789228e-06, "loss": 0.8559, "step": 4390 }, { "epoch": 1.8269765748901543, "grad_norm": 1.6587538719177246, "learning_rate": 6.983210749653444e-06, "loss": 0.8635, "step": 4391 }, { "epoch": 1.8273925590827549, "grad_norm": 1.8519984483718872, "learning_rate": 6.978939483082836e-06, "loss": 0.8972, "step": 4392 }, { "epoch": 1.8278085432753555, "grad_norm": 1.80805504322052, "learning_rate": 6.974668822934787e-06, "loss": 0.8389, "step": 4393 }, { "epoch": 1.8282245274679563, "grad_norm": 41.04240036010742, "learning_rate": 6.970398770066558e-06, "loss": 0.653, "step": 4394 }, { "epoch": 1.8286405116605569, "grad_norm": 1.6518995761871338, "learning_rate": 6.966129325335271e-06, "loss": 0.8274, "step": 4395 }, { "epoch": 1.8290564958531577, "grad_norm": 1.8516199588775635, "learning_rate": 6.961860489597954e-06, "loss": 0.7682, "step": 4396 }, { "epoch": 1.8294724800457582, "grad_norm": 1.7986531257629395, "learning_rate": 6.957592263711485e-06, "loss": 0.8549, "step": 4397 }, { "epoch": 1.8298884642383588, "grad_norm": 2.8801681995391846, "learning_rate": 6.9533246485326436e-06, "loss": 0.7704, "step": 4398 }, { "epoch": 1.8303044484309596, "grad_norm": 18.588722229003906, "learning_rate": 6.949057644918066e-06, "loss": 0.8466, "step": 4399 }, { "epoch": 1.8307204326235604, "grad_norm": 1.671314001083374, "learning_rate": 6.944791253724281e-06, "loss": 0.7547, "step": 4400 }, { "epoch": 1.831136416816161, "grad_norm": 1.773635983467102, "learning_rate": 6.94052547580768e-06, "loss": 0.7903, "step": 4401 }, { "epoch": 1.8315524010087616, "grad_norm": 1.636931300163269, "learning_rate": 6.936260312024552e-06, "loss": 0.9211, "step": 4402 }, { "epoch": 1.8319683852013624, "grad_norm": 1.7824163436889648, "learning_rate": 6.931995763231038e-06, "loss": 0.9184, "step": 4403 }, { "epoch": 1.832384369393963, "grad_norm": 1.672540307044983, "learning_rate": 6.9277318302831794e-06, "loss": 0.6919, "step": 4404 }, { "epoch": 1.8328003535865638, "grad_norm": 1.9096238613128662, "learning_rate": 6.923468514036876e-06, "loss": 0.765, "step": 4405 }, { "epoch": 1.8332163377791644, "grad_norm": 1.7073420286178589, "learning_rate": 6.919205815347914e-06, "loss": 0.7883, "step": 4406 }, { "epoch": 1.833632321971765, "grad_norm": 1.8299734592437744, "learning_rate": 6.914943735071945e-06, "loss": 0.7184, "step": 4407 }, { "epoch": 1.8340483061643658, "grad_norm": 1.7565068006515503, "learning_rate": 6.910682274064513e-06, "loss": 0.8401, "step": 4408 }, { "epoch": 1.8344642903569666, "grad_norm": 103.30394744873047, "learning_rate": 6.906421433181021e-06, "loss": 0.8775, "step": 4409 }, { "epoch": 1.8348802745495671, "grad_norm": 1.766397476196289, "learning_rate": 6.9021612132767566e-06, "loss": 0.8872, "step": 4410 }, { "epoch": 1.8352962587421677, "grad_norm": 2.107367753982544, "learning_rate": 6.897901615206885e-06, "loss": 0.8574, "step": 4411 }, { "epoch": 1.8357122429347683, "grad_norm": 1.7807772159576416, "learning_rate": 6.893642639826435e-06, "loss": 0.8526, "step": 4412 }, { "epoch": 1.8361282271273691, "grad_norm": 3.127312183380127, "learning_rate": 6.889384287990327e-06, "loss": 0.7436, "step": 4413 }, { "epoch": 1.83654421131997, "grad_norm": 4.796069622039795, "learning_rate": 6.8851265605533405e-06, "loss": 0.7073, "step": 4414 }, { "epoch": 1.8369601955125705, "grad_norm": 1.5823851823806763, "learning_rate": 6.880869458370139e-06, "loss": 0.7411, "step": 4415 }, { "epoch": 1.837376179705171, "grad_norm": 1.7976198196411133, "learning_rate": 6.876612982295254e-06, "loss": 0.7561, "step": 4416 }, { "epoch": 1.837792163897772, "grad_norm": 1.831662893295288, "learning_rate": 6.8723571331831004e-06, "loss": 0.8142, "step": 4417 }, { "epoch": 1.8382081480903727, "grad_norm": 1.7823721170425415, "learning_rate": 6.868101911887957e-06, "loss": 0.8666, "step": 4418 }, { "epoch": 1.8386241322829733, "grad_norm": 1.77888822555542, "learning_rate": 6.863847319263988e-06, "loss": 0.793, "step": 4419 }, { "epoch": 1.8390401164755739, "grad_norm": 1.94591224193573, "learning_rate": 6.859593356165218e-06, "loss": 0.8522, "step": 4420 }, { "epoch": 1.8394561006681744, "grad_norm": 1.7196204662322998, "learning_rate": 6.855340023445558e-06, "loss": 0.8929, "step": 4421 }, { "epoch": 1.8398720848607752, "grad_norm": 1.6461094617843628, "learning_rate": 6.851087321958779e-06, "loss": 0.8895, "step": 4422 }, { "epoch": 1.840288069053376, "grad_norm": 1.8226176500320435, "learning_rate": 6.846835252558543e-06, "loss": 0.7489, "step": 4423 }, { "epoch": 1.8407040532459766, "grad_norm": 1.6538770198822021, "learning_rate": 6.842583816098367e-06, "loss": 0.8177, "step": 4424 }, { "epoch": 1.8411200374385772, "grad_norm": 1.7015849351882935, "learning_rate": 6.8383330134316484e-06, "loss": 0.7256, "step": 4425 }, { "epoch": 1.841536021631178, "grad_norm": 1.9463907480239868, "learning_rate": 6.834082845411667e-06, "loss": 0.9333, "step": 4426 }, { "epoch": 1.8419520058237788, "grad_norm": 1.7032231092453003, "learning_rate": 6.829833312891556e-06, "loss": 0.8369, "step": 4427 }, { "epoch": 1.8423679900163794, "grad_norm": 1.8465336561203003, "learning_rate": 6.82558441672434e-06, "loss": 0.7504, "step": 4428 }, { "epoch": 1.84278397420898, "grad_norm": 1.757532000541687, "learning_rate": 6.8213361577629e-06, "loss": 0.8845, "step": 4429 }, { "epoch": 1.8431999584015806, "grad_norm": 1.7260587215423584, "learning_rate": 6.817088536860001e-06, "loss": 0.8711, "step": 4430 }, { "epoch": 1.8436159425941814, "grad_norm": 1.80439031124115, "learning_rate": 6.812841554868271e-06, "loss": 0.9267, "step": 4431 }, { "epoch": 1.8440319267867822, "grad_norm": 1.7008987665176392, "learning_rate": 6.808595212640217e-06, "loss": 0.8046, "step": 4432 }, { "epoch": 1.8444479109793828, "grad_norm": 1.714853286743164, "learning_rate": 6.804349511028212e-06, "loss": 0.6825, "step": 4433 }, { "epoch": 1.8448638951719833, "grad_norm": 1.744016170501709, "learning_rate": 6.800104450884506e-06, "loss": 0.7214, "step": 4434 }, { "epoch": 1.8452798793645842, "grad_norm": 1.7544296979904175, "learning_rate": 6.795860033061212e-06, "loss": 0.9648, "step": 4435 }, { "epoch": 1.845695863557185, "grad_norm": 1.6181858777999878, "learning_rate": 6.791616258410327e-06, "loss": 0.7979, "step": 4436 }, { "epoch": 1.8461118477497855, "grad_norm": 1.8404673337936401, "learning_rate": 6.7873731277837e-06, "loss": 0.7923, "step": 4437 }, { "epoch": 1.8465278319423861, "grad_norm": 1.7678735256195068, "learning_rate": 6.783130642033074e-06, "loss": 0.7481, "step": 4438 }, { "epoch": 1.8469438161349867, "grad_norm": 37.45534133911133, "learning_rate": 6.778888802010042e-06, "loss": 0.7381, "step": 4439 }, { "epoch": 1.8473598003275875, "grad_norm": 1.890650987625122, "learning_rate": 6.774647608566077e-06, "loss": 0.8189, "step": 4440 }, { "epoch": 1.8477757845201883, "grad_norm": 1.9341981410980225, "learning_rate": 6.770407062552524e-06, "loss": 0.8793, "step": 4441 }, { "epoch": 1.848191768712789, "grad_norm": 1.7927641868591309, "learning_rate": 6.76616716482059e-06, "loss": 0.7762, "step": 4442 }, { "epoch": 1.8486077529053895, "grad_norm": 1.790503740310669, "learning_rate": 6.761927916221362e-06, "loss": 0.7813, "step": 4443 }, { "epoch": 1.8490237370979903, "grad_norm": 2.028014659881592, "learning_rate": 6.757689317605789e-06, "loss": 0.8192, "step": 4444 }, { "epoch": 1.849439721290591, "grad_norm": 1.7747694253921509, "learning_rate": 6.753451369824693e-06, "loss": 0.8704, "step": 4445 }, { "epoch": 1.8498557054831917, "grad_norm": 1.7984524965286255, "learning_rate": 6.74921407372876e-06, "loss": 0.8892, "step": 4446 }, { "epoch": 1.8502716896757923, "grad_norm": 1.6169666051864624, "learning_rate": 6.744977430168558e-06, "loss": 0.6966, "step": 4447 }, { "epoch": 1.8506876738683928, "grad_norm": 1.7634642124176025, "learning_rate": 6.740741439994507e-06, "loss": 0.8005, "step": 4448 }, { "epoch": 1.8511036580609936, "grad_norm": 104.49154663085938, "learning_rate": 6.736506104056912e-06, "loss": 0.7695, "step": 4449 }, { "epoch": 1.8515196422535944, "grad_norm": 2.0997250080108643, "learning_rate": 6.732271423205929e-06, "loss": 0.8463, "step": 4450 }, { "epoch": 1.851935626446195, "grad_norm": 1.6600481271743774, "learning_rate": 6.728037398291605e-06, "loss": 0.8136, "step": 4451 }, { "epoch": 1.8523516106387956, "grad_norm": 1.759394884109497, "learning_rate": 6.723804030163832e-06, "loss": 0.8334, "step": 4452 }, { "epoch": 1.8527675948313964, "grad_norm": 1.7858418226242065, "learning_rate": 6.719571319672391e-06, "loss": 0.7894, "step": 4453 }, { "epoch": 1.8531835790239972, "grad_norm": 1.7820065021514893, "learning_rate": 6.7153392676669115e-06, "loss": 0.8374, "step": 4454 }, { "epoch": 1.8535995632165978, "grad_norm": 1.7097147703170776, "learning_rate": 6.711107874996905e-06, "loss": 0.7749, "step": 4455 }, { "epoch": 1.8540155474091984, "grad_norm": 1.8179808855056763, "learning_rate": 6.70687714251175e-06, "loss": 0.7989, "step": 4456 }, { "epoch": 1.854431531601799, "grad_norm": 1.821792483329773, "learning_rate": 6.702647071060679e-06, "loss": 0.8086, "step": 4457 }, { "epoch": 1.8548475157943998, "grad_norm": 1.7140603065490723, "learning_rate": 6.698417661492813e-06, "loss": 0.7875, "step": 4458 }, { "epoch": 1.8552634999870006, "grad_norm": 1.7345980405807495, "learning_rate": 6.694188914657119e-06, "loss": 0.7824, "step": 4459 }, { "epoch": 1.8556794841796012, "grad_norm": 1.8697868585586548, "learning_rate": 6.689960831402447e-06, "loss": 0.7536, "step": 4460 }, { "epoch": 1.8560954683722017, "grad_norm": 1.845673680305481, "learning_rate": 6.6857334125774985e-06, "loss": 0.8421, "step": 4461 }, { "epoch": 1.8565114525648025, "grad_norm": 1.7790262699127197, "learning_rate": 6.681506659030861e-06, "loss": 0.798, "step": 4462 }, { "epoch": 1.8569274367574033, "grad_norm": 1.7861062288284302, "learning_rate": 6.677280571610971e-06, "loss": 0.8622, "step": 4463 }, { "epoch": 1.857343420950004, "grad_norm": 1.7965693473815918, "learning_rate": 6.6730551511661414e-06, "loss": 0.7883, "step": 4464 }, { "epoch": 1.8577594051426045, "grad_norm": 1.6783826351165771, "learning_rate": 6.668830398544544e-06, "loss": 0.868, "step": 4465 }, { "epoch": 1.858175389335205, "grad_norm": 1.7647463083267212, "learning_rate": 6.6646063145942245e-06, "loss": 0.815, "step": 4466 }, { "epoch": 1.858591373527806, "grad_norm": 1.694556713104248, "learning_rate": 6.660382900163087e-06, "loss": 0.8492, "step": 4467 }, { "epoch": 1.8590073577204067, "grad_norm": 1.7668499946594238, "learning_rate": 6.6561601560989095e-06, "loss": 0.8956, "step": 4468 }, { "epoch": 1.8594233419130073, "grad_norm": 88.58216857910156, "learning_rate": 6.651938083249325e-06, "loss": 0.8361, "step": 4469 }, { "epoch": 1.8598393261056079, "grad_norm": 1.7998225688934326, "learning_rate": 6.64771668246184e-06, "loss": 0.9189, "step": 4470 }, { "epoch": 1.8602553102982087, "grad_norm": 1.8800164461135864, "learning_rate": 6.6434959545838254e-06, "loss": 0.8483, "step": 4471 }, { "epoch": 1.8606712944908095, "grad_norm": 1.8261089324951172, "learning_rate": 6.639275900462512e-06, "loss": 0.7113, "step": 4472 }, { "epoch": 1.86108727868341, "grad_norm": 2.2748212814331055, "learning_rate": 6.635056520945e-06, "loss": 0.8204, "step": 4473 }, { "epoch": 1.8615032628760106, "grad_norm": 1.748878002166748, "learning_rate": 6.630837816878249e-06, "loss": 0.8257, "step": 4474 }, { "epoch": 1.8619192470686112, "grad_norm": 1.7648364305496216, "learning_rate": 6.626619789109092e-06, "loss": 0.7495, "step": 4475 }, { "epoch": 1.862335231261212, "grad_norm": 1.810042142868042, "learning_rate": 6.622402438484214e-06, "loss": 0.8786, "step": 4476 }, { "epoch": 1.8627512154538128, "grad_norm": 1.7655357122421265, "learning_rate": 6.61818576585018e-06, "loss": 0.7832, "step": 4477 }, { "epoch": 1.8631671996464134, "grad_norm": 1.6718453168869019, "learning_rate": 6.613969772053401e-06, "loss": 0.668, "step": 4478 }, { "epoch": 1.863583183839014, "grad_norm": 1.664939284324646, "learning_rate": 6.6097544579401685e-06, "loss": 0.7135, "step": 4479 }, { "epoch": 1.8639991680316148, "grad_norm": 1.8144110441207886, "learning_rate": 6.60553982435662e-06, "loss": 0.8148, "step": 4480 }, { "epoch": 1.8644151522242156, "grad_norm": 1.7621792554855347, "learning_rate": 6.601325872148776e-06, "loss": 0.7468, "step": 4481 }, { "epoch": 1.8648311364168162, "grad_norm": 2.214750051498413, "learning_rate": 6.597112602162501e-06, "loss": 0.8319, "step": 4482 }, { "epoch": 1.8652471206094168, "grad_norm": 1.7689121961593628, "learning_rate": 6.59290001524354e-06, "loss": 0.7842, "step": 4483 }, { "epoch": 1.8656631048020174, "grad_norm": 2.1156113147735596, "learning_rate": 6.588688112237488e-06, "loss": 0.9733, "step": 4484 }, { "epoch": 1.8660790889946182, "grad_norm": 1.7338670492172241, "learning_rate": 6.5844768939898064e-06, "loss": 0.8571, "step": 4485 }, { "epoch": 1.866495073187219, "grad_norm": 1.664426326751709, "learning_rate": 6.580266361345825e-06, "loss": 0.7841, "step": 4486 }, { "epoch": 1.8669110573798195, "grad_norm": 1.9835891723632812, "learning_rate": 6.576056515150727e-06, "loss": 0.919, "step": 4487 }, { "epoch": 1.8673270415724201, "grad_norm": 1.8025723695755005, "learning_rate": 6.571847356249565e-06, "loss": 0.8641, "step": 4488 }, { "epoch": 1.867743025765021, "grad_norm": 1.9386087656021118, "learning_rate": 6.567638885487244e-06, "loss": 0.9294, "step": 4489 }, { "epoch": 1.8681590099576217, "grad_norm": 1.7720987796783447, "learning_rate": 6.563431103708548e-06, "loss": 0.785, "step": 4490 }, { "epoch": 1.8685749941502223, "grad_norm": 1.8916985988616943, "learning_rate": 6.559224011758101e-06, "loss": 0.8895, "step": 4491 }, { "epoch": 1.868990978342823, "grad_norm": 7.339920997619629, "learning_rate": 6.5550176104804075e-06, "loss": 0.88, "step": 4492 }, { "epoch": 1.8694069625354235, "grad_norm": 1.691626787185669, "learning_rate": 6.5508119007198226e-06, "loss": 0.8174, "step": 4493 }, { "epoch": 1.8698229467280243, "grad_norm": 1.6812275648117065, "learning_rate": 6.546606883320566e-06, "loss": 0.8894, "step": 4494 }, { "epoch": 1.870238930920625, "grad_norm": 1.726395845413208, "learning_rate": 6.542402559126715e-06, "loss": 0.899, "step": 4495 }, { "epoch": 1.8706549151132257, "grad_norm": 2.1370739936828613, "learning_rate": 6.5381989289822176e-06, "loss": 0.8856, "step": 4496 }, { "epoch": 1.8710708993058263, "grad_norm": 1.737644076347351, "learning_rate": 6.533995993730866e-06, "loss": 0.7683, "step": 4497 }, { "epoch": 1.871486883498427, "grad_norm": 1.7771127223968506, "learning_rate": 6.529793754216333e-06, "loss": 0.841, "step": 4498 }, { "epoch": 1.8719028676910279, "grad_norm": 1.5909227132797241, "learning_rate": 6.525592211282133e-06, "loss": 0.9017, "step": 4499 }, { "epoch": 1.8723188518836285, "grad_norm": 1.9222300052642822, "learning_rate": 6.52139136577165e-06, "loss": 0.8218, "step": 4500 }, { "epoch": 1.8723188518836285, "eval_loss": 0.7629926800727844, "eval_runtime": 1801.1932, "eval_samples_per_second": 3.659, "eval_steps_per_second": 1.83, "step": 4500 }, { "epoch": 1.872734836076229, "grad_norm": 1.7232954502105713, "learning_rate": 6.517191218528132e-06, "loss": 0.8313, "step": 4501 }, { "epoch": 1.8731508202688296, "grad_norm": 1.763482689857483, "learning_rate": 6.5129917703946764e-06, "loss": 0.8001, "step": 4502 }, { "epoch": 1.8735668044614304, "grad_norm": 1.708755373954773, "learning_rate": 6.508793022214248e-06, "loss": 0.7979, "step": 4503 }, { "epoch": 1.8739827886540312, "grad_norm": 1.8031039237976074, "learning_rate": 6.5045949748296655e-06, "loss": 0.7161, "step": 4504 }, { "epoch": 1.8743987728466318, "grad_norm": 1.8072373867034912, "learning_rate": 6.500397629083616e-06, "loss": 0.7979, "step": 4505 }, { "epoch": 1.8748147570392324, "grad_norm": 73.13491821289062, "learning_rate": 6.496200985818631e-06, "loss": 0.7669, "step": 4506 }, { "epoch": 1.8752307412318332, "grad_norm": 1.9066351652145386, "learning_rate": 6.49200504587712e-06, "loss": 0.8332, "step": 4507 }, { "epoch": 1.875646725424434, "grad_norm": 1.8360035419464111, "learning_rate": 6.487809810101333e-06, "loss": 0.8678, "step": 4508 }, { "epoch": 1.8760627096170346, "grad_norm": 1.7969022989273071, "learning_rate": 6.483615279333393e-06, "loss": 0.803, "step": 4509 }, { "epoch": 1.8764786938096352, "grad_norm": 1.8298695087432861, "learning_rate": 6.479421454415267e-06, "loss": 0.9183, "step": 4510 }, { "epoch": 1.8768946780022358, "grad_norm": 81.44860076904297, "learning_rate": 6.475228336188797e-06, "loss": 0.8816, "step": 4511 }, { "epoch": 1.8773106621948366, "grad_norm": 1.7661627531051636, "learning_rate": 6.471035925495669e-06, "loss": 0.8137, "step": 4512 }, { "epoch": 1.8777266463874374, "grad_norm": 1.8971067667007446, "learning_rate": 6.466844223177435e-06, "loss": 0.8348, "step": 4513 }, { "epoch": 1.878142630580038, "grad_norm": 1.7601948976516724, "learning_rate": 6.462653230075502e-06, "loss": 0.9114, "step": 4514 }, { "epoch": 1.8785586147726385, "grad_norm": 1.9925793409347534, "learning_rate": 6.458462947031132e-06, "loss": 0.9334, "step": 4515 }, { "epoch": 1.8789745989652393, "grad_norm": 1.6644550561904907, "learning_rate": 6.454273374885457e-06, "loss": 0.9104, "step": 4516 }, { "epoch": 1.8793905831578401, "grad_norm": 1.7054026126861572, "learning_rate": 6.4500845144794476e-06, "loss": 0.7843, "step": 4517 }, { "epoch": 1.8798065673504407, "grad_norm": 1.7233002185821533, "learning_rate": 6.445896366653945e-06, "loss": 0.7455, "step": 4518 }, { "epoch": 1.8802225515430413, "grad_norm": 1.8797773122787476, "learning_rate": 6.4417089322496395e-06, "loss": 0.8917, "step": 4519 }, { "epoch": 1.8806385357356419, "grad_norm": 1.8132562637329102, "learning_rate": 6.437522212107088e-06, "loss": 0.9314, "step": 4520 }, { "epoch": 1.8810545199282427, "grad_norm": 1.8534857034683228, "learning_rate": 6.433336207066691e-06, "loss": 0.8561, "step": 4521 }, { "epoch": 1.8814705041208435, "grad_norm": 1.7444969415664673, "learning_rate": 6.4291509179687185e-06, "loss": 0.8662, "step": 4522 }, { "epoch": 1.881886488313444, "grad_norm": 1.717119812965393, "learning_rate": 6.424966345653285e-06, "loss": 0.7391, "step": 4523 }, { "epoch": 1.8823024725060447, "grad_norm": 1.897773027420044, "learning_rate": 6.420782490960373e-06, "loss": 0.8152, "step": 4524 }, { "epoch": 1.8827184566986455, "grad_norm": 1.7393605709075928, "learning_rate": 6.416599354729807e-06, "loss": 0.7511, "step": 4525 }, { "epoch": 1.8831344408912463, "grad_norm": 1.7360979318618774, "learning_rate": 6.412416937801281e-06, "loss": 0.697, "step": 4526 }, { "epoch": 1.8835504250838468, "grad_norm": 1.7895511388778687, "learning_rate": 6.408235241014335e-06, "loss": 0.7662, "step": 4527 }, { "epoch": 1.8839664092764474, "grad_norm": 1.8203072547912598, "learning_rate": 6.404054265208375e-06, "loss": 0.7958, "step": 4528 }, { "epoch": 1.884382393469048, "grad_norm": 1.8748514652252197, "learning_rate": 6.399874011222642e-06, "loss": 0.882, "step": 4529 }, { "epoch": 1.8847983776616488, "grad_norm": 1.8293051719665527, "learning_rate": 6.3956944798962595e-06, "loss": 0.8352, "step": 4530 }, { "epoch": 1.8852143618542496, "grad_norm": 1.90829336643219, "learning_rate": 6.3915156720681826e-06, "loss": 0.8185, "step": 4531 }, { "epoch": 1.8856303460468502, "grad_norm": 88.091796875, "learning_rate": 6.387337588577234e-06, "loss": 0.9019, "step": 4532 }, { "epoch": 1.8860463302394508, "grad_norm": 1.846705436706543, "learning_rate": 6.383160230262089e-06, "loss": 0.809, "step": 4533 }, { "epoch": 1.8864623144320516, "grad_norm": 1.8145570755004883, "learning_rate": 6.3789835979612705e-06, "loss": 0.9004, "step": 4534 }, { "epoch": 1.8868782986246524, "grad_norm": 1.887487769126892, "learning_rate": 6.374807692513167e-06, "loss": 0.8548, "step": 4535 }, { "epoch": 1.887294282817253, "grad_norm": 9.66593074798584, "learning_rate": 6.3706325147560096e-06, "loss": 0.7308, "step": 4536 }, { "epoch": 1.8877102670098536, "grad_norm": 1.9420462846755981, "learning_rate": 6.366458065527895e-06, "loss": 0.8744, "step": 4537 }, { "epoch": 1.8881262512024541, "grad_norm": 2.056464910507202, "learning_rate": 6.362284345666763e-06, "loss": 0.8593, "step": 4538 }, { "epoch": 1.888542235395055, "grad_norm": 1.7694544792175293, "learning_rate": 6.358111356010412e-06, "loss": 0.8045, "step": 4539 }, { "epoch": 1.8889582195876558, "grad_norm": 1.6594352722167969, "learning_rate": 6.353939097396491e-06, "loss": 0.8914, "step": 4540 }, { "epoch": 1.8893742037802563, "grad_norm": 47.598785400390625, "learning_rate": 6.34976757066251e-06, "loss": 0.8975, "step": 4541 }, { "epoch": 1.889790187972857, "grad_norm": 1.753320574760437, "learning_rate": 6.345596776645821e-06, "loss": 0.8615, "step": 4542 }, { "epoch": 1.8902061721654577, "grad_norm": 1.6780407428741455, "learning_rate": 6.341426716183641e-06, "loss": 0.71, "step": 4543 }, { "epoch": 1.8906221563580585, "grad_norm": 1.7587701082229614, "learning_rate": 6.337257390113023e-06, "loss": 0.803, "step": 4544 }, { "epoch": 1.891038140550659, "grad_norm": 1.7063482999801636, "learning_rate": 6.333088799270895e-06, "loss": 0.8067, "step": 4545 }, { "epoch": 1.8914541247432597, "grad_norm": 1.6962864398956299, "learning_rate": 6.328920944494013e-06, "loss": 0.8515, "step": 4546 }, { "epoch": 1.8918701089358603, "grad_norm": 1.717970371246338, "learning_rate": 6.3247538266190074e-06, "loss": 0.7419, "step": 4547 }, { "epoch": 1.892286093128461, "grad_norm": 1.8513697385787964, "learning_rate": 6.320587446482349e-06, "loss": 0.8076, "step": 4548 }, { "epoch": 1.8927020773210619, "grad_norm": 43.303436279296875, "learning_rate": 6.3164218049203565e-06, "loss": 0.7609, "step": 4549 }, { "epoch": 1.8931180615136625, "grad_norm": 1.714493751525879, "learning_rate": 6.312256902769214e-06, "loss": 0.8503, "step": 4550 }, { "epoch": 1.893534045706263, "grad_norm": 1.9265661239624023, "learning_rate": 6.308092740864944e-06, "loss": 0.8587, "step": 4551 }, { "epoch": 1.8939500298988639, "grad_norm": 1.8266140222549438, "learning_rate": 6.303929320043428e-06, "loss": 0.972, "step": 4552 }, { "epoch": 1.8943660140914647, "grad_norm": 1.8420467376708984, "learning_rate": 6.299766641140392e-06, "loss": 0.856, "step": 4553 }, { "epoch": 1.8947819982840652, "grad_norm": 1.6500962972640991, "learning_rate": 6.295604704991427e-06, "loss": 0.7386, "step": 4554 }, { "epoch": 1.8951979824766658, "grad_norm": 1.8739954233169556, "learning_rate": 6.291443512431954e-06, "loss": 0.828, "step": 4555 }, { "epoch": 1.8956139666692664, "grad_norm": 1.7571443319320679, "learning_rate": 6.287283064297265e-06, "loss": 0.7961, "step": 4556 }, { "epoch": 1.8960299508618672, "grad_norm": 1.759351134300232, "learning_rate": 6.283123361422489e-06, "loss": 0.814, "step": 4557 }, { "epoch": 1.896445935054468, "grad_norm": 138.8590087890625, "learning_rate": 6.2789644046426155e-06, "loss": 0.739, "step": 4558 }, { "epoch": 1.8968619192470686, "grad_norm": 1.782804250717163, "learning_rate": 6.274806194792469e-06, "loss": 0.8594, "step": 4559 }, { "epoch": 1.8972779034396692, "grad_norm": 1.7830054759979248, "learning_rate": 6.2706487327067456e-06, "loss": 0.8396, "step": 4560 }, { "epoch": 1.89769388763227, "grad_norm": 1.7862961292266846, "learning_rate": 6.2664920192199685e-06, "loss": 0.822, "step": 4561 }, { "epoch": 1.8981098718248708, "grad_norm": 1.907006859779358, "learning_rate": 6.26233605516653e-06, "loss": 0.8539, "step": 4562 }, { "epoch": 1.8985258560174714, "grad_norm": 1.7674962282180786, "learning_rate": 6.258180841380663e-06, "loss": 0.7629, "step": 4563 }, { "epoch": 1.898941840210072, "grad_norm": 1.8374884128570557, "learning_rate": 6.254026378696445e-06, "loss": 0.8383, "step": 4564 }, { "epoch": 1.8993578244026725, "grad_norm": 1.5850248336791992, "learning_rate": 6.2498726679478165e-06, "loss": 0.703, "step": 4565 }, { "epoch": 1.8997738085952733, "grad_norm": 1.8461660146713257, "learning_rate": 6.245719709968552e-06, "loss": 0.7365, "step": 4566 }, { "epoch": 1.9001897927878741, "grad_norm": 2.09702467918396, "learning_rate": 6.241567505592285e-06, "loss": 0.8328, "step": 4567 }, { "epoch": 1.9006057769804747, "grad_norm": 1.7652767896652222, "learning_rate": 6.237416055652491e-06, "loss": 0.6842, "step": 4568 }, { "epoch": 1.9010217611730753, "grad_norm": 1.7559266090393066, "learning_rate": 6.233265360982503e-06, "loss": 0.7941, "step": 4569 }, { "epoch": 1.9014377453656761, "grad_norm": 1.717539668083191, "learning_rate": 6.2291154224154905e-06, "loss": 0.8006, "step": 4570 }, { "epoch": 1.901853729558277, "grad_norm": 2.468595027923584, "learning_rate": 6.2249662407844855e-06, "loss": 0.8501, "step": 4571 }, { "epoch": 1.9022697137508775, "grad_norm": 1.8021209239959717, "learning_rate": 6.220817816922353e-06, "loss": 0.8462, "step": 4572 }, { "epoch": 1.902685697943478, "grad_norm": 1.7248451709747314, "learning_rate": 6.216670151661819e-06, "loss": 0.861, "step": 4573 }, { "epoch": 1.9031016821360787, "grad_norm": 1.7598786354064941, "learning_rate": 6.212523245835444e-06, "loss": 0.8105, "step": 4574 }, { "epoch": 1.9035176663286795, "grad_norm": 1.912029504776001, "learning_rate": 6.208377100275651e-06, "loss": 0.9453, "step": 4575 }, { "epoch": 1.9039336505212803, "grad_norm": 1.8501770496368408, "learning_rate": 6.204231715814695e-06, "loss": 0.7895, "step": 4576 }, { "epoch": 1.9043496347138809, "grad_norm": 2850.239501953125, "learning_rate": 6.200087093284693e-06, "loss": 0.8679, "step": 4577 }, { "epoch": 1.9047656189064814, "grad_norm": 1.682174563407898, "learning_rate": 6.195943233517601e-06, "loss": 0.8835, "step": 4578 }, { "epoch": 1.9051816030990822, "grad_norm": 1.7742207050323486, "learning_rate": 6.191800137345218e-06, "loss": 0.812, "step": 4579 }, { "epoch": 1.905597587291683, "grad_norm": 1.7858830690383911, "learning_rate": 6.187657805599203e-06, "loss": 0.8394, "step": 4580 }, { "epoch": 1.9060135714842836, "grad_norm": 1.805820345878601, "learning_rate": 6.183516239111045e-06, "loss": 0.8821, "step": 4581 }, { "epoch": 1.9064295556768842, "grad_norm": 1.746946930885315, "learning_rate": 6.179375438712092e-06, "loss": 0.8387, "step": 4582 }, { "epoch": 1.9068455398694848, "grad_norm": 1.6857844591140747, "learning_rate": 6.1752354052335306e-06, "loss": 0.7356, "step": 4583 }, { "epoch": 1.9072615240620856, "grad_norm": 1.764841079711914, "learning_rate": 6.171096139506402e-06, "loss": 0.8292, "step": 4584 }, { "epoch": 1.9076775082546864, "grad_norm": 8.53031063079834, "learning_rate": 6.16695764236158e-06, "loss": 0.8206, "step": 4585 }, { "epoch": 1.908093492447287, "grad_norm": 1.730102777481079, "learning_rate": 6.162819914629802e-06, "loss": 0.8519, "step": 4586 }, { "epoch": 1.9085094766398876, "grad_norm": 1.7753382921218872, "learning_rate": 6.158682957141633e-06, "loss": 0.7823, "step": 4587 }, { "epoch": 1.9089254608324884, "grad_norm": 1.7652316093444824, "learning_rate": 6.154546770727496e-06, "loss": 0.7995, "step": 4588 }, { "epoch": 1.9093414450250892, "grad_norm": 1.7533533573150635, "learning_rate": 6.150411356217652e-06, "loss": 0.7112, "step": 4589 }, { "epoch": 1.9097574292176898, "grad_norm": 1.8609060049057007, "learning_rate": 6.146276714442212e-06, "loss": 0.8405, "step": 4590 }, { "epoch": 1.9101734134102903, "grad_norm": 1.8712643384933472, "learning_rate": 6.142142846231126e-06, "loss": 0.8258, "step": 4591 }, { "epoch": 1.910589397602891, "grad_norm": 43.56181716918945, "learning_rate": 6.138009752414193e-06, "loss": 0.8861, "step": 4592 }, { "epoch": 1.9110053817954917, "grad_norm": 1.7939025163650513, "learning_rate": 6.1338774338210625e-06, "loss": 0.8989, "step": 4593 }, { "epoch": 1.9114213659880925, "grad_norm": 1.965701699256897, "learning_rate": 6.129745891281211e-06, "loss": 0.8041, "step": 4594 }, { "epoch": 1.9118373501806931, "grad_norm": 2.0614166259765625, "learning_rate": 6.125615125623981e-06, "loss": 0.8793, "step": 4595 }, { "epoch": 1.9122533343732937, "grad_norm": 1.6847286224365234, "learning_rate": 6.1214851376785415e-06, "loss": 0.7132, "step": 4596 }, { "epoch": 1.9126693185658945, "grad_norm": 1.8999534845352173, "learning_rate": 6.117355928273914e-06, "loss": 0.9278, "step": 4597 }, { "epoch": 1.9130853027584953, "grad_norm": 1.838029384613037, "learning_rate": 6.113227498238957e-06, "loss": 0.8326, "step": 4598 }, { "epoch": 1.913501286951096, "grad_norm": 1.71394681930542, "learning_rate": 6.109099848402385e-06, "loss": 0.7882, "step": 4599 }, { "epoch": 1.9139172711436965, "grad_norm": 1.818638801574707, "learning_rate": 6.1049729795927404e-06, "loss": 0.8409, "step": 4600 }, { "epoch": 1.914333255336297, "grad_norm": 3.000781536102295, "learning_rate": 6.1008468926384235e-06, "loss": 0.6913, "step": 4601 }, { "epoch": 1.9147492395288979, "grad_norm": 1.7175190448760986, "learning_rate": 6.096721588367667e-06, "loss": 0.7633, "step": 4602 }, { "epoch": 1.9151652237214987, "grad_norm": 1.645021677017212, "learning_rate": 6.092597067608552e-06, "loss": 0.8179, "step": 4603 }, { "epoch": 1.9155812079140992, "grad_norm": 1.9909921884536743, "learning_rate": 6.088473331188994e-06, "loss": 0.8672, "step": 4604 }, { "epoch": 1.9159971921066998, "grad_norm": 1.820976734161377, "learning_rate": 6.084350379936768e-06, "loss": 0.8181, "step": 4605 }, { "epoch": 1.9164131762993006, "grad_norm": 1.8347878456115723, "learning_rate": 6.080228214679472e-06, "loss": 0.8461, "step": 4606 }, { "epoch": 1.9168291604919014, "grad_norm": 1.7712825536727905, "learning_rate": 6.076106836244558e-06, "loss": 0.7544, "step": 4607 }, { "epoch": 1.917245144684502, "grad_norm": 2.018127679824829, "learning_rate": 6.071986245459322e-06, "loss": 0.872, "step": 4608 }, { "epoch": 1.9176611288771026, "grad_norm": 1.7912585735321045, "learning_rate": 6.067866443150888e-06, "loss": 0.801, "step": 4609 }, { "epoch": 1.9180771130697032, "grad_norm": 1.7744344472885132, "learning_rate": 6.063747430146241e-06, "loss": 0.87, "step": 4610 }, { "epoch": 1.918493097262304, "grad_norm": 1.6652346849441528, "learning_rate": 6.059629207272191e-06, "loss": 0.877, "step": 4611 }, { "epoch": 1.9189090814549048, "grad_norm": 1.7539687156677246, "learning_rate": 6.055511775355397e-06, "loss": 0.8452, "step": 4612 }, { "epoch": 1.9193250656475054, "grad_norm": 3.5819880962371826, "learning_rate": 6.0513951352223555e-06, "loss": 0.8706, "step": 4613 }, { "epoch": 1.919741049840106, "grad_norm": 1.7415680885314941, "learning_rate": 6.0472792876994125e-06, "loss": 0.8929, "step": 4614 }, { "epoch": 1.9201570340327068, "grad_norm": 1.5987612009048462, "learning_rate": 6.043164233612743e-06, "loss": 0.7451, "step": 4615 }, { "epoch": 1.9205730182253076, "grad_norm": 1.6806024312973022, "learning_rate": 6.039049973788374e-06, "loss": 0.8534, "step": 4616 }, { "epoch": 1.9209890024179082, "grad_norm": 1.8601983785629272, "learning_rate": 6.034936509052165e-06, "loss": 0.8264, "step": 4617 }, { "epoch": 1.9214049866105087, "grad_norm": 1.6775031089782715, "learning_rate": 6.0308238402298195e-06, "loss": 0.8174, "step": 4618 }, { "epoch": 1.9218209708031093, "grad_norm": 2.8121259212493896, "learning_rate": 6.026711968146877e-06, "loss": 0.8188, "step": 4619 }, { "epoch": 1.9222369549957101, "grad_norm": 1.7420165538787842, "learning_rate": 6.022600893628726e-06, "loss": 0.9408, "step": 4620 }, { "epoch": 1.922652939188311, "grad_norm": 1.9374606609344482, "learning_rate": 6.018490617500586e-06, "loss": 0.7982, "step": 4621 }, { "epoch": 1.9230689233809115, "grad_norm": 1.8302494287490845, "learning_rate": 6.0143811405875175e-06, "loss": 0.7991, "step": 4622 }, { "epoch": 1.923484907573512, "grad_norm": 1.7289400100708008, "learning_rate": 6.01027246371443e-06, "loss": 0.7755, "step": 4623 }, { "epoch": 1.923900891766113, "grad_norm": 1.8569409847259521, "learning_rate": 6.006164587706057e-06, "loss": 0.8385, "step": 4624 }, { "epoch": 1.9243168759587137, "grad_norm": 1.7816511392593384, "learning_rate": 6.002057513386987e-06, "loss": 0.8098, "step": 4625 }, { "epoch": 1.9247328601513143, "grad_norm": 1.6537622213363647, "learning_rate": 5.997951241581634e-06, "loss": 0.8267, "step": 4626 }, { "epoch": 1.9251488443439149, "grad_norm": 1.8301963806152344, "learning_rate": 5.993845773114262e-06, "loss": 0.7877, "step": 4627 }, { "epoch": 1.9255648285365154, "grad_norm": 1.9166117906570435, "learning_rate": 5.98974110880896e-06, "loss": 0.9066, "step": 4628 }, { "epoch": 1.9259808127291163, "grad_norm": 1.928674578666687, "learning_rate": 5.9856372494896755e-06, "loss": 0.7354, "step": 4629 }, { "epoch": 1.926396796921717, "grad_norm": 1.735756516456604, "learning_rate": 5.981534195980173e-06, "loss": 0.8912, "step": 4630 }, { "epoch": 1.9268127811143176, "grad_norm": 2.539865493774414, "learning_rate": 5.9774319491040736e-06, "loss": 0.9259, "step": 4631 }, { "epoch": 1.9272287653069182, "grad_norm": 1.8846416473388672, "learning_rate": 5.973330509684818e-06, "loss": 0.8323, "step": 4632 }, { "epoch": 1.927644749499519, "grad_norm": 1.8310856819152832, "learning_rate": 5.969229878545706e-06, "loss": 0.8781, "step": 4633 }, { "epoch": 1.9280607336921198, "grad_norm": 1.7956186532974243, "learning_rate": 5.965130056509856e-06, "loss": 0.7721, "step": 4634 }, { "epoch": 1.9284767178847204, "grad_norm": 1.7988083362579346, "learning_rate": 5.9610310444002406e-06, "loss": 0.8287, "step": 4635 }, { "epoch": 1.928892702077321, "grad_norm": 1.7538644075393677, "learning_rate": 5.956932843039653e-06, "loss": 0.8162, "step": 4636 }, { "epoch": 1.9293086862699216, "grad_norm": 2.2538161277770996, "learning_rate": 5.952835453250733e-06, "loss": 0.7465, "step": 4637 }, { "epoch": 1.9297246704625224, "grad_norm": 1971.006591796875, "learning_rate": 5.948738875855964e-06, "loss": 0.7181, "step": 4638 }, { "epoch": 1.9301406546551232, "grad_norm": 3.318610906600952, "learning_rate": 5.944643111677649e-06, "loss": 0.8221, "step": 4639 }, { "epoch": 1.9305566388477238, "grad_norm": 3.295353889465332, "learning_rate": 5.940548161537947e-06, "loss": 0.8482, "step": 4640 }, { "epoch": 1.9309726230403244, "grad_norm": 1.7388478517532349, "learning_rate": 5.936454026258838e-06, "loss": 0.7652, "step": 4641 }, { "epoch": 1.9313886072329252, "grad_norm": 1.8539403676986694, "learning_rate": 5.932360706662148e-06, "loss": 0.8428, "step": 4642 }, { "epoch": 1.931804591425526, "grad_norm": 5.32097864151001, "learning_rate": 5.92826820356953e-06, "loss": 0.7877, "step": 4643 }, { "epoch": 1.9322205756181265, "grad_norm": 1.930948257446289, "learning_rate": 5.924176517802489e-06, "loss": 0.8862, "step": 4644 }, { "epoch": 1.9326365598107271, "grad_norm": 1.9418379068374634, "learning_rate": 5.9200856501823456e-06, "loss": 0.9281, "step": 4645 }, { "epoch": 1.9330525440033277, "grad_norm": 1.7598798274993896, "learning_rate": 5.915995601530276e-06, "loss": 0.7738, "step": 4646 }, { "epoch": 1.9334685281959285, "grad_norm": 1.8925104141235352, "learning_rate": 5.911906372667271e-06, "loss": 0.846, "step": 4647 }, { "epoch": 1.9338845123885293, "grad_norm": 15.122740745544434, "learning_rate": 5.907817964414182e-06, "loss": 0.8033, "step": 4648 }, { "epoch": 1.93430049658113, "grad_norm": 87.35289764404297, "learning_rate": 5.903730377591669e-06, "loss": 0.89, "step": 4649 }, { "epoch": 1.9347164807737305, "grad_norm": 1.8831645250320435, "learning_rate": 5.899643613020252e-06, "loss": 0.8811, "step": 4650 }, { "epoch": 1.9351324649663313, "grad_norm": 1.7580517530441284, "learning_rate": 5.895557671520265e-06, "loss": 0.785, "step": 4651 }, { "epoch": 1.935548449158932, "grad_norm": 1.895778775215149, "learning_rate": 5.891472553911892e-06, "loss": 0.9759, "step": 4652 }, { "epoch": 1.9359644333515327, "grad_norm": 1.9631456136703491, "learning_rate": 5.887388261015139e-06, "loss": 0.84, "step": 4653 }, { "epoch": 1.9363804175441333, "grad_norm": 1.6906135082244873, "learning_rate": 5.88330479364986e-06, "loss": 0.7841, "step": 4654 }, { "epoch": 1.9367964017367338, "grad_norm": 4.710916996002197, "learning_rate": 5.8792221526357315e-06, "loss": 0.8888, "step": 4655 }, { "epoch": 1.9372123859293346, "grad_norm": 2.270496368408203, "learning_rate": 5.875140338792271e-06, "loss": 0.8427, "step": 4656 }, { "epoch": 1.9376283701219355, "grad_norm": 1.9128715991973877, "learning_rate": 5.8710593529388305e-06, "loss": 0.8616, "step": 4657 }, { "epoch": 1.938044354314536, "grad_norm": 1.8118999004364014, "learning_rate": 5.866979195894585e-06, "loss": 0.8597, "step": 4658 }, { "epoch": 1.9384603385071366, "grad_norm": 1.7100214958190918, "learning_rate": 5.862899868478562e-06, "loss": 0.8103, "step": 4659 }, { "epoch": 1.9388763226997374, "grad_norm": 1.9001438617706299, "learning_rate": 5.858821371509602e-06, "loss": 0.7637, "step": 4660 }, { "epoch": 1.9392923068923382, "grad_norm": 1.8713675737380981, "learning_rate": 5.854743705806396e-06, "loss": 0.8946, "step": 4661 }, { "epoch": 1.9397082910849388, "grad_norm": 1.864347219467163, "learning_rate": 5.850666872187454e-06, "loss": 0.738, "step": 4662 }, { "epoch": 1.9401242752775394, "grad_norm": 1.9779021739959717, "learning_rate": 5.846590871471132e-06, "loss": 0.8942, "step": 4663 }, { "epoch": 1.94054025947014, "grad_norm": 1.6783077716827393, "learning_rate": 5.842515704475605e-06, "loss": 0.7671, "step": 4664 }, { "epoch": 1.9409562436627408, "grad_norm": 1.984713077545166, "learning_rate": 5.838441372018898e-06, "loss": 0.7411, "step": 4665 }, { "epoch": 1.9413722278553416, "grad_norm": 1.6299042701721191, "learning_rate": 5.834367874918849e-06, "loss": 0.6974, "step": 4666 }, { "epoch": 1.9417882120479422, "grad_norm": 1.803460717201233, "learning_rate": 5.830295213993147e-06, "loss": 0.8182, "step": 4667 }, { "epoch": 1.9422041962405427, "grad_norm": 1.8835138082504272, "learning_rate": 5.826223390059298e-06, "loss": 0.7988, "step": 4668 }, { "epoch": 1.9426201804331436, "grad_norm": 1.862457036972046, "learning_rate": 5.8221524039346434e-06, "loss": 0.9362, "step": 4669 }, { "epoch": 1.9430361646257444, "grad_norm": 1.7747650146484375, "learning_rate": 5.818082256436367e-06, "loss": 0.7853, "step": 4670 }, { "epoch": 1.943452148818345, "grad_norm": 19.78326416015625, "learning_rate": 5.814012948381469e-06, "loss": 0.8708, "step": 4671 }, { "epoch": 1.9438681330109455, "grad_norm": 1.830899953842163, "learning_rate": 5.809944480586795e-06, "loss": 0.7678, "step": 4672 }, { "epoch": 1.944284117203546, "grad_norm": 1.7888578176498413, "learning_rate": 5.805876853869009e-06, "loss": 0.8198, "step": 4673 }, { "epoch": 1.944700101396147, "grad_norm": 1.8711459636688232, "learning_rate": 5.80181006904462e-06, "loss": 0.8113, "step": 4674 }, { "epoch": 1.9451160855887477, "grad_norm": 1.8629405498504639, "learning_rate": 5.797744126929951e-06, "loss": 0.8405, "step": 4675 }, { "epoch": 1.9455320697813483, "grad_norm": 1.8058183193206787, "learning_rate": 5.793679028341177e-06, "loss": 0.8196, "step": 4676 }, { "epoch": 1.9459480539739489, "grad_norm": 1.750145673751831, "learning_rate": 5.789614774094284e-06, "loss": 0.8991, "step": 4677 }, { "epoch": 1.9463640381665497, "grad_norm": 1.660431146621704, "learning_rate": 5.7855513650051e-06, "loss": 0.7952, "step": 4678 }, { "epoch": 1.9467800223591505, "grad_norm": 1.7190836668014526, "learning_rate": 5.781488801889272e-06, "loss": 0.831, "step": 4679 }, { "epoch": 1.947196006551751, "grad_norm": 1.7660493850708008, "learning_rate": 5.777427085562297e-06, "loss": 0.7834, "step": 4680 }, { "epoch": 1.9476119907443517, "grad_norm": 2.0496957302093506, "learning_rate": 5.773366216839481e-06, "loss": 0.8581, "step": 4681 }, { "epoch": 1.9480279749369522, "grad_norm": 1.7528939247131348, "learning_rate": 5.769306196535976e-06, "loss": 0.7845, "step": 4682 }, { "epoch": 1.948443959129553, "grad_norm": 1.7817411422729492, "learning_rate": 5.765247025466749e-06, "loss": 0.916, "step": 4683 }, { "epoch": 1.9488599433221538, "grad_norm": 1.9603888988494873, "learning_rate": 5.76118870444661e-06, "loss": 0.8722, "step": 4684 }, { "epoch": 1.9492759275147544, "grad_norm": 2.00274395942688, "learning_rate": 5.757131234290194e-06, "loss": 0.8878, "step": 4685 }, { "epoch": 1.949691911707355, "grad_norm": 1.7459639310836792, "learning_rate": 5.753074615811958e-06, "loss": 0.8552, "step": 4686 }, { "epoch": 1.9501078958999558, "grad_norm": 1.7666243314743042, "learning_rate": 5.749018849826199e-06, "loss": 0.8181, "step": 4687 }, { "epoch": 1.9505238800925566, "grad_norm": 22.337570190429688, "learning_rate": 5.7449639371470364e-06, "loss": 0.7494, "step": 4688 }, { "epoch": 1.9509398642851572, "grad_norm": 1.7568635940551758, "learning_rate": 5.7409098785884185e-06, "loss": 0.7163, "step": 4689 }, { "epoch": 1.9513558484777578, "grad_norm": 1.7554243803024292, "learning_rate": 5.736856674964121e-06, "loss": 0.7626, "step": 4690 }, { "epoch": 1.9517718326703584, "grad_norm": 1.9779930114746094, "learning_rate": 5.732804327087756e-06, "loss": 0.9881, "step": 4691 }, { "epoch": 1.9521878168629592, "grad_norm": 1.6700345277786255, "learning_rate": 5.728752835772751e-06, "loss": 0.7275, "step": 4692 }, { "epoch": 1.95260380105556, "grad_norm": 1.7475676536560059, "learning_rate": 5.724702201832376e-06, "loss": 0.9139, "step": 4693 }, { "epoch": 1.9530197852481606, "grad_norm": 10.163490295410156, "learning_rate": 5.720652426079716e-06, "loss": 0.9342, "step": 4694 }, { "epoch": 1.9534357694407611, "grad_norm": 1.7440359592437744, "learning_rate": 5.716603509327694e-06, "loss": 0.7316, "step": 4695 }, { "epoch": 1.953851753633362, "grad_norm": 1.80363929271698, "learning_rate": 5.712555452389049e-06, "loss": 0.7395, "step": 4696 }, { "epoch": 1.9542677378259627, "grad_norm": 2.3247315883636475, "learning_rate": 5.708508256076364e-06, "loss": 0.7591, "step": 4697 }, { "epoch": 1.9546837220185633, "grad_norm": 1.8698327541351318, "learning_rate": 5.704461921202034e-06, "loss": 0.8544, "step": 4698 }, { "epoch": 1.955099706211164, "grad_norm": 2.8778445720672607, "learning_rate": 5.700416448578281e-06, "loss": 0.7097, "step": 4699 }, { "epoch": 1.9555156904037645, "grad_norm": 1.8974560499191284, "learning_rate": 5.6963718390171704e-06, "loss": 0.8468, "step": 4700 }, { "epoch": 1.9559316745963653, "grad_norm": 1.792360544204712, "learning_rate": 5.692328093330575e-06, "loss": 0.7684, "step": 4701 }, { "epoch": 1.956347658788966, "grad_norm": 1.7890552282333374, "learning_rate": 5.688285212330209e-06, "loss": 0.8805, "step": 4702 }, { "epoch": 1.9567636429815667, "grad_norm": 1.7841875553131104, "learning_rate": 5.684243196827601e-06, "loss": 0.795, "step": 4703 }, { "epoch": 1.9571796271741673, "grad_norm": 1.7025223970413208, "learning_rate": 5.680202047634118e-06, "loss": 0.7548, "step": 4704 }, { "epoch": 1.957595611366768, "grad_norm": 1.7035505771636963, "learning_rate": 5.676161765560939e-06, "loss": 0.64, "step": 4705 }, { "epoch": 1.9580115955593689, "grad_norm": 1.7789874076843262, "learning_rate": 5.6721223514190845e-06, "loss": 0.8001, "step": 4706 }, { "epoch": 1.9584275797519695, "grad_norm": 1.7881348133087158, "learning_rate": 5.6680838060193885e-06, "loss": 0.8116, "step": 4707 }, { "epoch": 1.95884356394457, "grad_norm": 842.0196533203125, "learning_rate": 5.664046130172516e-06, "loss": 0.8187, "step": 4708 }, { "epoch": 1.9592595481371706, "grad_norm": 1.834222674369812, "learning_rate": 5.660009324688952e-06, "loss": 0.7939, "step": 4709 }, { "epoch": 1.9596755323297714, "grad_norm": 1.9539494514465332, "learning_rate": 5.655973390379019e-06, "loss": 0.7657, "step": 4710 }, { "epoch": 1.9600915165223722, "grad_norm": 1.8049155473709106, "learning_rate": 5.651938328052849e-06, "loss": 0.7711, "step": 4711 }, { "epoch": 1.9605075007149728, "grad_norm": 1.776656150817871, "learning_rate": 5.6479041385204155e-06, "loss": 0.7994, "step": 4712 }, { "epoch": 1.9609234849075734, "grad_norm": 1.781558632850647, "learning_rate": 5.6438708225915e-06, "loss": 0.8691, "step": 4713 }, { "epoch": 1.9613394691001742, "grad_norm": 1.7942392826080322, "learning_rate": 5.63983838107572e-06, "loss": 0.9043, "step": 4714 }, { "epoch": 1.961755453292775, "grad_norm": 2.0650248527526855, "learning_rate": 5.6358068147825185e-06, "loss": 0.8598, "step": 4715 }, { "epoch": 1.9621714374853756, "grad_norm": 6.307229518890381, "learning_rate": 5.631776124521153e-06, "loss": 0.8643, "step": 4716 }, { "epoch": 1.9625874216779762, "grad_norm": 1.6992573738098145, "learning_rate": 5.627746311100715e-06, "loss": 0.8043, "step": 4717 }, { "epoch": 1.9630034058705768, "grad_norm": 1.9427956342697144, "learning_rate": 5.623717375330109e-06, "loss": 0.84, "step": 4718 }, { "epoch": 1.9634193900631776, "grad_norm": 1.8902758359909058, "learning_rate": 5.6196893180180775e-06, "loss": 0.8536, "step": 4719 }, { "epoch": 1.9638353742557784, "grad_norm": 1.7999175786972046, "learning_rate": 5.6156621399731725e-06, "loss": 0.8274, "step": 4720 }, { "epoch": 1.964251358448379, "grad_norm": 1.8970597982406616, "learning_rate": 5.611635842003783e-06, "loss": 0.7914, "step": 4721 }, { "epoch": 1.9646673426409795, "grad_norm": 1.6943806409835815, "learning_rate": 5.6076104249181075e-06, "loss": 0.7926, "step": 4722 }, { "epoch": 1.9650833268335803, "grad_norm": 1.8993873596191406, "learning_rate": 5.603585889524184e-06, "loss": 0.7832, "step": 4723 }, { "epoch": 1.9654993110261811, "grad_norm": 6.185051441192627, "learning_rate": 5.5995622366298515e-06, "loss": 0.8126, "step": 4724 }, { "epoch": 1.9659152952187817, "grad_norm": 3.9651806354522705, "learning_rate": 5.595539467042799e-06, "loss": 0.8553, "step": 4725 }, { "epoch": 1.9663312794113823, "grad_norm": 1.852326512336731, "learning_rate": 5.591517581570511e-06, "loss": 0.8436, "step": 4726 }, { "epoch": 1.9667472636039829, "grad_norm": 1.8793628215789795, "learning_rate": 5.587496581020317e-06, "loss": 0.888, "step": 4727 }, { "epoch": 1.9671632477965837, "grad_norm": 1.6743497848510742, "learning_rate": 5.583476466199357e-06, "loss": 0.8559, "step": 4728 }, { "epoch": 1.9675792319891845, "grad_norm": 1.7546536922454834, "learning_rate": 5.5794572379145875e-06, "loss": 0.8695, "step": 4729 }, { "epoch": 1.967995216181785, "grad_norm": 1.8800259828567505, "learning_rate": 5.575438896972807e-06, "loss": 0.8395, "step": 4730 }, { "epoch": 1.9684112003743857, "grad_norm": 1.7974153757095337, "learning_rate": 5.571421444180613e-06, "loss": 0.8471, "step": 4731 }, { "epoch": 1.9688271845669865, "grad_norm": 1.7663078308105469, "learning_rate": 5.5674048803444444e-06, "loss": 0.7474, "step": 4732 }, { "epoch": 1.9692431687595873, "grad_norm": 1.9058936834335327, "learning_rate": 5.563389206270544e-06, "loss": 0.7473, "step": 4733 }, { "epoch": 1.9696591529521879, "grad_norm": 1.8480446338653564, "learning_rate": 5.5593744227649955e-06, "loss": 0.857, "step": 4734 }, { "epoch": 1.9700751371447884, "grad_norm": 1.7388732433319092, "learning_rate": 5.555360530633682e-06, "loss": 0.7889, "step": 4735 }, { "epoch": 1.970491121337389, "grad_norm": 1.872458815574646, "learning_rate": 5.55134753068233e-06, "loss": 0.8748, "step": 4736 }, { "epoch": 1.9709071055299898, "grad_norm": 1.8591669797897339, "learning_rate": 5.547335423716467e-06, "loss": 0.9587, "step": 4737 }, { "epoch": 1.9713230897225906, "grad_norm": 1.7652943134307861, "learning_rate": 5.543324210541454e-06, "loss": 0.7362, "step": 4738 }, { "epoch": 1.9717390739151912, "grad_norm": 1.731640338897705, "learning_rate": 5.539313891962466e-06, "loss": 0.803, "step": 4739 }, { "epoch": 1.9721550581077918, "grad_norm": 1.8329331874847412, "learning_rate": 5.535304468784504e-06, "loss": 0.7723, "step": 4740 }, { "epoch": 1.9725710423003926, "grad_norm": 1.7829349040985107, "learning_rate": 5.53129594181238e-06, "loss": 0.6472, "step": 4741 }, { "epoch": 1.9729870264929934, "grad_norm": 1.733221411705017, "learning_rate": 5.527288311850742e-06, "loss": 0.797, "step": 4742 }, { "epoch": 1.973403010685594, "grad_norm": 1.7768824100494385, "learning_rate": 5.52328157970404e-06, "loss": 0.8678, "step": 4743 }, { "epoch": 1.9738189948781946, "grad_norm": 1.8411287069320679, "learning_rate": 5.5192757461765544e-06, "loss": 0.7955, "step": 4744 }, { "epoch": 1.9742349790707951, "grad_norm": 1.810996651649475, "learning_rate": 5.515270812072388e-06, "loss": 0.8203, "step": 4745 }, { "epoch": 1.974650963263396, "grad_norm": 2.104454517364502, "learning_rate": 5.511266778195454e-06, "loss": 0.8223, "step": 4746 }, { "epoch": 1.9750669474559968, "grad_norm": 1.7398275136947632, "learning_rate": 5.50726364534949e-06, "loss": 0.8068, "step": 4747 }, { "epoch": 1.9754829316485973, "grad_norm": 1.8697999715805054, "learning_rate": 5.503261414338046e-06, "loss": 0.9373, "step": 4748 }, { "epoch": 1.975898915841198, "grad_norm": 1.8494895696640015, "learning_rate": 5.499260085964504e-06, "loss": 0.8308, "step": 4749 }, { "epoch": 1.9763149000337987, "grad_norm": 1.7706698179244995, "learning_rate": 5.495259661032051e-06, "loss": 0.8, "step": 4750 }, { "epoch": 1.9767308842263995, "grad_norm": 1.881932020187378, "learning_rate": 5.491260140343706e-06, "loss": 0.7892, "step": 4751 }, { "epoch": 1.9771468684190001, "grad_norm": 2.0037338733673096, "learning_rate": 5.487261524702292e-06, "loss": 0.8966, "step": 4752 }, { "epoch": 1.9775628526116007, "grad_norm": 52.19820785522461, "learning_rate": 5.483263814910465e-06, "loss": 0.8854, "step": 4753 }, { "epoch": 1.9779788368042013, "grad_norm": 1.8220257759094238, "learning_rate": 5.479267011770685e-06, "loss": 0.8256, "step": 4754 }, { "epoch": 1.978394820996802, "grad_norm": 1.8518949747085571, "learning_rate": 5.475271116085244e-06, "loss": 0.8895, "step": 4755 }, { "epoch": 1.9788108051894029, "grad_norm": 1.7029731273651123, "learning_rate": 5.471276128656242e-06, "loss": 0.7439, "step": 4756 }, { "epoch": 1.9792267893820035, "grad_norm": 1.8319873809814453, "learning_rate": 5.467282050285601e-06, "loss": 0.7751, "step": 4757 }, { "epoch": 1.979642773574604, "grad_norm": 2.20188045501709, "learning_rate": 5.463288881775051e-06, "loss": 0.8765, "step": 4758 }, { "epoch": 1.9800587577672049, "grad_norm": 4.634256362915039, "learning_rate": 5.459296623926153e-06, "loss": 0.7158, "step": 4759 }, { "epoch": 1.9804747419598057, "grad_norm": 267.07928466796875, "learning_rate": 5.4553052775402855e-06, "loss": 0.7372, "step": 4760 }, { "epoch": 1.9808907261524062, "grad_norm": 1.840877890586853, "learning_rate": 5.4513148434186295e-06, "loss": 0.8832, "step": 4761 }, { "epoch": 1.9813067103450068, "grad_norm": 1.7288892269134521, "learning_rate": 5.447325322362198e-06, "loss": 0.798, "step": 4762 }, { "epoch": 1.9817226945376074, "grad_norm": 1.885305643081665, "learning_rate": 5.4433367151718074e-06, "loss": 0.7519, "step": 4763 }, { "epoch": 1.9821386787302082, "grad_norm": 1.7587405443191528, "learning_rate": 5.439349022648106e-06, "loss": 0.737, "step": 4764 }, { "epoch": 1.982554662922809, "grad_norm": 1.640037178993225, "learning_rate": 5.43536224559154e-06, "loss": 0.667, "step": 4765 }, { "epoch": 1.9829706471154096, "grad_norm": 1.9749598503112793, "learning_rate": 5.431376384802393e-06, "loss": 0.8863, "step": 4766 }, { "epoch": 1.9833866313080102, "grad_norm": 1.6344271898269653, "learning_rate": 5.427391441080746e-06, "loss": 0.6908, "step": 4767 }, { "epoch": 1.983802615500611, "grad_norm": 1.7745684385299683, "learning_rate": 5.423407415226506e-06, "loss": 0.7666, "step": 4768 }, { "epoch": 1.9842185996932118, "grad_norm": 1.7188727855682373, "learning_rate": 5.419424308039389e-06, "loss": 0.7748, "step": 4769 }, { "epoch": 1.9846345838858124, "grad_norm": 1.8370354175567627, "learning_rate": 5.415442120318937e-06, "loss": 0.9104, "step": 4770 }, { "epoch": 1.985050568078413, "grad_norm": 1.884287714958191, "learning_rate": 5.411460852864497e-06, "loss": 0.8356, "step": 4771 }, { "epoch": 1.9854665522710135, "grad_norm": 2.132678508758545, "learning_rate": 5.407480506475238e-06, "loss": 0.8583, "step": 4772 }, { "epoch": 1.9858825364636143, "grad_norm": 1.7735166549682617, "learning_rate": 5.403501081950138e-06, "loss": 0.7827, "step": 4773 }, { "epoch": 1.9862985206562151, "grad_norm": 1.8934780359268188, "learning_rate": 5.399522580087999e-06, "loss": 0.8631, "step": 4774 }, { "epoch": 1.9867145048488157, "grad_norm": 1.9080955982208252, "learning_rate": 5.395545001687428e-06, "loss": 0.769, "step": 4775 }, { "epoch": 1.9871304890414163, "grad_norm": 6.7178850173950195, "learning_rate": 5.391568347546854e-06, "loss": 0.7655, "step": 4776 }, { "epoch": 1.9875464732340171, "grad_norm": 1.914642095565796, "learning_rate": 5.387592618464518e-06, "loss": 0.8674, "step": 4777 }, { "epoch": 1.987962457426618, "grad_norm": 1.7501585483551025, "learning_rate": 5.383617815238468e-06, "loss": 0.8069, "step": 4778 }, { "epoch": 1.9883784416192185, "grad_norm": 1.824456810951233, "learning_rate": 5.379643938666581e-06, "loss": 0.9043, "step": 4779 }, { "epoch": 1.988794425811819, "grad_norm": 1.792170763015747, "learning_rate": 5.375670989546533e-06, "loss": 0.8958, "step": 4780 }, { "epoch": 1.9892104100044197, "grad_norm": 1.7165770530700684, "learning_rate": 5.371698968675829e-06, "loss": 0.8037, "step": 4781 }, { "epoch": 1.9896263941970205, "grad_norm": 1.8736475706100464, "learning_rate": 5.3677278768517714e-06, "loss": 0.8961, "step": 4782 }, { "epoch": 1.9900423783896213, "grad_norm": 1.7963590621948242, "learning_rate": 5.363757714871492e-06, "loss": 0.8575, "step": 4783 }, { "epoch": 1.9904583625822219, "grad_norm": 1.7295472621917725, "learning_rate": 5.359788483531918e-06, "loss": 0.7854, "step": 4784 }, { "epoch": 1.9908743467748224, "grad_norm": 1.838823676109314, "learning_rate": 5.355820183629811e-06, "loss": 0.886, "step": 4785 }, { "epoch": 1.9912903309674232, "grad_norm": 35.48053741455078, "learning_rate": 5.351852815961728e-06, "loss": 0.9153, "step": 4786 }, { "epoch": 1.991706315160024, "grad_norm": 233.60121154785156, "learning_rate": 5.347886381324047e-06, "loss": 0.8108, "step": 4787 }, { "epoch": 1.9921222993526246, "grad_norm": 795.7938842773438, "learning_rate": 5.343920880512953e-06, "loss": 0.7946, "step": 4788 }, { "epoch": 1.9925382835452252, "grad_norm": 1.767920732498169, "learning_rate": 5.3399563143244545e-06, "loss": 0.7736, "step": 4789 }, { "epoch": 1.9929542677378258, "grad_norm": 1.8356196880340576, "learning_rate": 5.3359926835543586e-06, "loss": 0.7621, "step": 4790 }, { "epoch": 1.9933702519304266, "grad_norm": 1.8646554946899414, "learning_rate": 5.332029988998295e-06, "loss": 0.8826, "step": 4791 }, { "epoch": 1.9937862361230274, "grad_norm": 1.990957498550415, "learning_rate": 5.328068231451706e-06, "loss": 0.8493, "step": 4792 }, { "epoch": 1.994202220315628, "grad_norm": 1.9867106676101685, "learning_rate": 5.324107411709836e-06, "loss": 0.8318, "step": 4793 }, { "epoch": 1.9946182045082286, "grad_norm": 1.8152321577072144, "learning_rate": 5.320147530567752e-06, "loss": 0.7249, "step": 4794 }, { "epoch": 1.9950341887008294, "grad_norm": 1.7512741088867188, "learning_rate": 5.316188588820323e-06, "loss": 0.7405, "step": 4795 }, { "epoch": 1.9954501728934302, "grad_norm": 1.7320574522018433, "learning_rate": 5.312230587262238e-06, "loss": 0.7348, "step": 4796 }, { "epoch": 1.9958661570860308, "grad_norm": 234.1103973388672, "learning_rate": 5.308273526687986e-06, "loss": 0.819, "step": 4797 }, { "epoch": 1.9962821412786314, "grad_norm": 1.7250163555145264, "learning_rate": 5.3043174078918845e-06, "loss": 0.7843, "step": 4798 }, { "epoch": 1.996698125471232, "grad_norm": 1.8048146963119507, "learning_rate": 5.300362231668042e-06, "loss": 0.8859, "step": 4799 }, { "epoch": 1.9971141096638327, "grad_norm": 1.9349174499511719, "learning_rate": 5.296407998810398e-06, "loss": 0.8198, "step": 4800 }, { "epoch": 1.9975300938564335, "grad_norm": 1.9306910037994385, "learning_rate": 5.2924547101126825e-06, "loss": 0.8408, "step": 4801 }, { "epoch": 1.9979460780490341, "grad_norm": 1.9036942720413208, "learning_rate": 5.288502366368453e-06, "loss": 0.8862, "step": 4802 }, { "epoch": 1.9983620622416347, "grad_norm": 302.9581298828125, "learning_rate": 5.2845509683710625e-06, "loss": 0.767, "step": 4803 }, { "epoch": 1.9987780464342355, "grad_norm": 1.9159168004989624, "learning_rate": 5.280600516913692e-06, "loss": 0.906, "step": 4804 }, { "epoch": 1.9991940306268363, "grad_norm": 1.8269288539886475, "learning_rate": 5.276651012789313e-06, "loss": 0.8531, "step": 4805 }, { "epoch": 1.999610014819437, "grad_norm": 1.7595394849777222, "learning_rate": 5.272702456790724e-06, "loss": 0.8057, "step": 4806 }, { "epoch": 2.0, "grad_norm": 2.065281391143799, "learning_rate": 5.26875484971052e-06, "loss": 0.7668, "step": 4807 }, { "epoch": 2.0004159841926006, "grad_norm": 1.758406162261963, "learning_rate": 5.2648081923411095e-06, "loss": 0.8376, "step": 4808 }, { "epoch": 2.000831968385201, "grad_norm": 1.889032006263733, "learning_rate": 5.260862485474718e-06, "loss": 0.8185, "step": 4809 }, { "epoch": 2.001247952577802, "grad_norm": 1.9136682748794556, "learning_rate": 5.2569177299033656e-06, "loss": 0.7454, "step": 4810 }, { "epoch": 2.0016639367704028, "grad_norm": 1.8723700046539307, "learning_rate": 5.2529739264188985e-06, "loss": 0.7385, "step": 4811 }, { "epoch": 2.0020799209630034, "grad_norm": 1.9210467338562012, "learning_rate": 5.249031075812954e-06, "loss": 0.9233, "step": 4812 }, { "epoch": 2.002495905155604, "grad_norm": 1.8237929344177246, "learning_rate": 5.245089178876996e-06, "loss": 0.7466, "step": 4813 }, { "epoch": 2.002911889348205, "grad_norm": 1.6920336484909058, "learning_rate": 5.24114823640228e-06, "loss": 0.9118, "step": 4814 }, { "epoch": 2.0033278735408055, "grad_norm": 1.7584619522094727, "learning_rate": 5.237208249179886e-06, "loss": 0.6917, "step": 4815 }, { "epoch": 2.003743857733406, "grad_norm": 1.9156886339187622, "learning_rate": 5.233269218000691e-06, "loss": 0.7959, "step": 4816 }, { "epoch": 2.0041598419260067, "grad_norm": 1.860249638557434, "learning_rate": 5.229331143655381e-06, "loss": 0.8801, "step": 4817 }, { "epoch": 2.0045758261186073, "grad_norm": 1.891375184059143, "learning_rate": 5.22539402693445e-06, "loss": 0.7694, "step": 4818 }, { "epoch": 2.0049918103112083, "grad_norm": 2.3143868446350098, "learning_rate": 5.221457868628211e-06, "loss": 0.7545, "step": 4819 }, { "epoch": 2.005407794503809, "grad_norm": 1.9926867485046387, "learning_rate": 5.217522669526767e-06, "loss": 0.6908, "step": 4820 }, { "epoch": 2.0058237786964095, "grad_norm": 2.7702338695526123, "learning_rate": 5.213588430420039e-06, "loss": 0.8019, "step": 4821 }, { "epoch": 2.00623976288901, "grad_norm": 1.8141357898712158, "learning_rate": 5.20965515209776e-06, "loss": 0.8147, "step": 4822 }, { "epoch": 2.006655747081611, "grad_norm": 1.823941707611084, "learning_rate": 5.205722835349455e-06, "loss": 0.7381, "step": 4823 }, { "epoch": 2.0070717312742117, "grad_norm": 1.8951081037521362, "learning_rate": 5.201791480964471e-06, "loss": 0.8389, "step": 4824 }, { "epoch": 2.0074877154668123, "grad_norm": 1.9125561714172363, "learning_rate": 5.197861089731955e-06, "loss": 0.7504, "step": 4825 }, { "epoch": 2.007903699659413, "grad_norm": 1.9381814002990723, "learning_rate": 5.193931662440856e-06, "loss": 0.8718, "step": 4826 }, { "epoch": 2.0083196838520134, "grad_norm": 1.8549882173538208, "learning_rate": 5.190003199879935e-06, "loss": 0.7652, "step": 4827 }, { "epoch": 2.0087356680446145, "grad_norm": 1.6929489374160767, "learning_rate": 5.186075702837765e-06, "loss": 0.8456, "step": 4828 }, { "epoch": 2.009151652237215, "grad_norm": 1.7201263904571533, "learning_rate": 5.18214917210271e-06, "loss": 0.7351, "step": 4829 }, { "epoch": 2.0095676364298156, "grad_norm": 1.9090911149978638, "learning_rate": 5.17822360846296e-06, "loss": 0.822, "step": 4830 }, { "epoch": 2.009983620622416, "grad_norm": 16.161123275756836, "learning_rate": 5.17429901270649e-06, "loss": 0.6928, "step": 4831 }, { "epoch": 2.0103996048150172, "grad_norm": 1.7311233282089233, "learning_rate": 5.170375385621098e-06, "loss": 0.7683, "step": 4832 }, { "epoch": 2.010815589007618, "grad_norm": 1.9265320301055908, "learning_rate": 5.166452727994373e-06, "loss": 0.7151, "step": 4833 }, { "epoch": 2.0112315732002184, "grad_norm": 1.7491929531097412, "learning_rate": 5.162531040613725e-06, "loss": 0.8744, "step": 4834 }, { "epoch": 2.011647557392819, "grad_norm": 1.7753816843032837, "learning_rate": 5.158610324266358e-06, "loss": 0.7949, "step": 4835 }, { "epoch": 2.0120635415854196, "grad_norm": 1.9726346731185913, "learning_rate": 5.154690579739279e-06, "loss": 0.8273, "step": 4836 }, { "epoch": 2.0124795257780206, "grad_norm": 1.7295159101486206, "learning_rate": 5.150771807819313e-06, "loss": 0.8, "step": 4837 }, { "epoch": 2.012895509970621, "grad_norm": 1.7551915645599365, "learning_rate": 5.146854009293072e-06, "loss": 0.8148, "step": 4838 }, { "epoch": 2.0133114941632217, "grad_norm": 1.885811448097229, "learning_rate": 5.142937184946992e-06, "loss": 0.815, "step": 4839 }, { "epoch": 2.0137274783558223, "grad_norm": 1.8496283292770386, "learning_rate": 5.1390213355672955e-06, "loss": 0.8147, "step": 4840 }, { "epoch": 2.0141434625484234, "grad_norm": 1.684757947921753, "learning_rate": 5.135106461940027e-06, "loss": 0.6964, "step": 4841 }, { "epoch": 2.014559446741024, "grad_norm": 1.7538949251174927, "learning_rate": 5.1311925648510155e-06, "loss": 0.679, "step": 4842 }, { "epoch": 2.0149754309336245, "grad_norm": 1.8244366645812988, "learning_rate": 5.127279645085912e-06, "loss": 0.8678, "step": 4843 }, { "epoch": 2.015391415126225, "grad_norm": 1.8514437675476074, "learning_rate": 5.123367703430156e-06, "loss": 0.7537, "step": 4844 }, { "epoch": 2.0158073993188257, "grad_norm": 1.714751958847046, "learning_rate": 5.119456740669006e-06, "loss": 0.8827, "step": 4845 }, { "epoch": 2.0162233835114267, "grad_norm": 1.7206995487213135, "learning_rate": 5.1155467575875136e-06, "loss": 0.8043, "step": 4846 }, { "epoch": 2.0166393677040273, "grad_norm": 1.8682092428207397, "learning_rate": 5.111637754970535e-06, "loss": 0.7993, "step": 4847 }, { "epoch": 2.017055351896628, "grad_norm": 3.9224343299865723, "learning_rate": 5.107729733602729e-06, "loss": 0.7483, "step": 4848 }, { "epoch": 2.0174713360892285, "grad_norm": 2.026002883911133, "learning_rate": 5.103822694268563e-06, "loss": 0.7408, "step": 4849 }, { "epoch": 2.0178873202818295, "grad_norm": 1.9368709325790405, "learning_rate": 5.099916637752298e-06, "loss": 0.7233, "step": 4850 }, { "epoch": 2.01830330447443, "grad_norm": 1.8379569053649902, "learning_rate": 5.096011564838008e-06, "loss": 0.7835, "step": 4851 }, { "epoch": 2.0187192886670307, "grad_norm": 1.8302550315856934, "learning_rate": 5.092107476309568e-06, "loss": 0.8787, "step": 4852 }, { "epoch": 2.0191352728596312, "grad_norm": 1.6893017292022705, "learning_rate": 5.088204372950645e-06, "loss": 0.8342, "step": 4853 }, { "epoch": 2.019551257052232, "grad_norm": 1.8088067770004272, "learning_rate": 5.084302255544722e-06, "loss": 0.7205, "step": 4854 }, { "epoch": 2.019967241244833, "grad_norm": 1.8100413084030151, "learning_rate": 5.080401124875074e-06, "loss": 0.7023, "step": 4855 }, { "epoch": 2.0203832254374334, "grad_norm": 1.8851842880249023, "learning_rate": 5.076500981724782e-06, "loss": 0.7999, "step": 4856 }, { "epoch": 2.020799209630034, "grad_norm": 33.38238525390625, "learning_rate": 5.072601826876725e-06, "loss": 0.8015, "step": 4857 }, { "epoch": 2.0212151938226346, "grad_norm": 1.8180267810821533, "learning_rate": 5.068703661113595e-06, "loss": 0.8449, "step": 4858 }, { "epoch": 2.0216311780152356, "grad_norm": 1.8155304193496704, "learning_rate": 5.064806485217867e-06, "loss": 0.804, "step": 4859 }, { "epoch": 2.022047162207836, "grad_norm": 1.733863115310669, "learning_rate": 5.060910299971838e-06, "loss": 0.8467, "step": 4860 }, { "epoch": 2.022463146400437, "grad_norm": 1.9677066802978516, "learning_rate": 5.057015106157587e-06, "loss": 0.8192, "step": 4861 }, { "epoch": 2.0228791305930374, "grad_norm": 1.8826665878295898, "learning_rate": 5.053120904557009e-06, "loss": 0.8439, "step": 4862 }, { "epoch": 2.023295114785638, "grad_norm": 1.8488376140594482, "learning_rate": 5.049227695951791e-06, "loss": 0.8568, "step": 4863 }, { "epoch": 2.023711098978239, "grad_norm": 1.9075261354446411, "learning_rate": 5.0453354811234255e-06, "loss": 0.8526, "step": 4864 }, { "epoch": 2.0241270831708396, "grad_norm": 1.7241277694702148, "learning_rate": 5.041444260853201e-06, "loss": 0.7502, "step": 4865 }, { "epoch": 2.02454306736344, "grad_norm": 1.8549505472183228, "learning_rate": 5.037554035922207e-06, "loss": 0.8515, "step": 4866 }, { "epoch": 2.0249590515560407, "grad_norm": 1.7797895669937134, "learning_rate": 5.0336648071113405e-06, "loss": 0.6406, "step": 4867 }, { "epoch": 2.0253750357486417, "grad_norm": 1.744124412536621, "learning_rate": 5.029776575201286e-06, "loss": 0.6412, "step": 4868 }, { "epoch": 2.0257910199412423, "grad_norm": 1.998699426651001, "learning_rate": 5.0258893409725415e-06, "loss": 0.8345, "step": 4869 }, { "epoch": 2.026207004133843, "grad_norm": 1.7937201261520386, "learning_rate": 5.022003105205392e-06, "loss": 0.8513, "step": 4870 }, { "epoch": 2.0266229883264435, "grad_norm": 1.7055786848068237, "learning_rate": 5.018117868679935e-06, "loss": 0.7117, "step": 4871 }, { "epoch": 2.027038972519044, "grad_norm": 1.819511890411377, "learning_rate": 5.0142336321760535e-06, "loss": 0.7785, "step": 4872 }, { "epoch": 2.027454956711645, "grad_norm": 1.8506033420562744, "learning_rate": 5.010350396473443e-06, "loss": 0.8632, "step": 4873 }, { "epoch": 2.0278709409042457, "grad_norm": 1.9408166408538818, "learning_rate": 5.006468162351588e-06, "loss": 0.7837, "step": 4874 }, { "epoch": 2.0282869250968463, "grad_norm": 1.983962059020996, "learning_rate": 5.002586930589779e-06, "loss": 0.7758, "step": 4875 }, { "epoch": 2.028702909289447, "grad_norm": 1.8252509832382202, "learning_rate": 4.9987067019670945e-06, "loss": 0.8422, "step": 4876 }, { "epoch": 2.029118893482048, "grad_norm": 2.381131649017334, "learning_rate": 4.994827477262429e-06, "loss": 0.7989, "step": 4877 }, { "epoch": 2.0295348776746485, "grad_norm": 10.382339477539062, "learning_rate": 4.990949257254458e-06, "loss": 0.83, "step": 4878 }, { "epoch": 2.029950861867249, "grad_norm": 1.983427882194519, "learning_rate": 4.987072042721671e-06, "loss": 0.8528, "step": 4879 }, { "epoch": 2.0303668460598496, "grad_norm": 2.014322519302368, "learning_rate": 4.98319583444234e-06, "loss": 0.8727, "step": 4880 }, { "epoch": 2.03078283025245, "grad_norm": 1.7403373718261719, "learning_rate": 4.979320633194546e-06, "loss": 0.7762, "step": 4881 }, { "epoch": 2.0311988144450512, "grad_norm": 1.8215535879135132, "learning_rate": 4.9754464397561694e-06, "loss": 0.7572, "step": 4882 }, { "epoch": 2.031614798637652, "grad_norm": 151.0079803466797, "learning_rate": 4.971573254904879e-06, "loss": 0.7659, "step": 4883 }, { "epoch": 2.0320307828302524, "grad_norm": 1.739895224571228, "learning_rate": 4.967701079418141e-06, "loss": 0.767, "step": 4884 }, { "epoch": 2.032446767022853, "grad_norm": 1.905136227607727, "learning_rate": 4.963829914073234e-06, "loss": 0.9325, "step": 4885 }, { "epoch": 2.032862751215454, "grad_norm": 1.9031330347061157, "learning_rate": 4.959959759647217e-06, "loss": 0.865, "step": 4886 }, { "epoch": 2.0332787354080546, "grad_norm": 1.75641930103302, "learning_rate": 4.956090616916951e-06, "loss": 0.7791, "step": 4887 }, { "epoch": 2.033694719600655, "grad_norm": 1.9306412935256958, "learning_rate": 4.952222486659102e-06, "loss": 0.843, "step": 4888 }, { "epoch": 2.0341107037932558, "grad_norm": 69.55744934082031, "learning_rate": 4.948355369650118e-06, "loss": 0.6864, "step": 4889 }, { "epoch": 2.0345266879858563, "grad_norm": 1.691346287727356, "learning_rate": 4.9444892666662605e-06, "loss": 0.7341, "step": 4890 }, { "epoch": 2.0349426721784574, "grad_norm": 1.9483604431152344, "learning_rate": 4.940624178483572e-06, "loss": 0.8462, "step": 4891 }, { "epoch": 2.035358656371058, "grad_norm": 1.819884181022644, "learning_rate": 4.936760105877903e-06, "loss": 0.8126, "step": 4892 }, { "epoch": 2.0357746405636585, "grad_norm": 1.974323034286499, "learning_rate": 4.9328970496248905e-06, "loss": 0.8379, "step": 4893 }, { "epoch": 2.036190624756259, "grad_norm": 1.8630046844482422, "learning_rate": 4.929035010499979e-06, "loss": 0.8218, "step": 4894 }, { "epoch": 2.03660660894886, "grad_norm": 1.8640927076339722, "learning_rate": 4.925173989278399e-06, "loss": 0.8476, "step": 4895 }, { "epoch": 2.0370225931414607, "grad_norm": 1.745063066482544, "learning_rate": 4.921313986735173e-06, "loss": 0.8963, "step": 4896 }, { "epoch": 2.0374385773340613, "grad_norm": 1.7822388410568237, "learning_rate": 4.917455003645137e-06, "loss": 0.7226, "step": 4897 }, { "epoch": 2.037854561526662, "grad_norm": 1.8025821447372437, "learning_rate": 4.9135970407829025e-06, "loss": 0.9105, "step": 4898 }, { "epoch": 2.0382705457192625, "grad_norm": 1.6022781133651733, "learning_rate": 4.909740098922892e-06, "loss": 0.7664, "step": 4899 }, { "epoch": 2.0386865299118635, "grad_norm": 1.8243920803070068, "learning_rate": 4.905884178839307e-06, "loss": 0.7938, "step": 4900 }, { "epoch": 2.039102514104464, "grad_norm": 1.810836911201477, "learning_rate": 4.902029281306163e-06, "loss": 0.842, "step": 4901 }, { "epoch": 2.0395184982970647, "grad_norm": 1.799210786819458, "learning_rate": 4.898175407097252e-06, "loss": 0.7954, "step": 4902 }, { "epoch": 2.0399344824896652, "grad_norm": 1.8605595827102661, "learning_rate": 4.894322556986174e-06, "loss": 0.8667, "step": 4903 }, { "epoch": 2.0403504666822663, "grad_norm": 1.9353276491165161, "learning_rate": 4.890470731746316e-06, "loss": 0.8789, "step": 4904 }, { "epoch": 2.040766450874867, "grad_norm": 1.7940411567687988, "learning_rate": 4.886619932150862e-06, "loss": 0.7438, "step": 4905 }, { "epoch": 2.0411824350674674, "grad_norm": 1.9553931951522827, "learning_rate": 4.882770158972785e-06, "loss": 0.8231, "step": 4906 }, { "epoch": 2.041598419260068, "grad_norm": 1.8635525703430176, "learning_rate": 4.878921412984864e-06, "loss": 0.8053, "step": 4907 }, { "epoch": 2.0420144034526686, "grad_norm": 102.97711944580078, "learning_rate": 4.875073694959656e-06, "loss": 0.7966, "step": 4908 }, { "epoch": 2.0424303876452696, "grad_norm": 1.707582712173462, "learning_rate": 4.871227005669528e-06, "loss": 0.7721, "step": 4909 }, { "epoch": 2.04284637183787, "grad_norm": 1.8138344287872314, "learning_rate": 4.8673813458866256e-06, "loss": 0.7651, "step": 4910 }, { "epoch": 2.043262356030471, "grad_norm": 1.726593255996704, "learning_rate": 4.8635367163829e-06, "loss": 0.826, "step": 4911 }, { "epoch": 2.0436783402230714, "grad_norm": 1.8251041173934937, "learning_rate": 4.859693117930087e-06, "loss": 0.8465, "step": 4912 }, { "epoch": 2.0440943244156724, "grad_norm": 1.6730724573135376, "learning_rate": 4.85585055129972e-06, "loss": 0.8232, "step": 4913 }, { "epoch": 2.044510308608273, "grad_norm": 1.9338738918304443, "learning_rate": 4.852009017263125e-06, "loss": 0.7734, "step": 4914 }, { "epoch": 2.0449262928008736, "grad_norm": 1.882011890411377, "learning_rate": 4.848168516591414e-06, "loss": 0.8356, "step": 4915 }, { "epoch": 2.045342276993474, "grad_norm": 1.9149208068847656, "learning_rate": 4.844329050055504e-06, "loss": 0.8585, "step": 4916 }, { "epoch": 2.0457582611860747, "grad_norm": 1.8485207557678223, "learning_rate": 4.840490618426092e-06, "loss": 0.8162, "step": 4917 }, { "epoch": 2.0461742453786758, "grad_norm": 15.269914627075195, "learning_rate": 4.8366532224736775e-06, "loss": 0.8033, "step": 4918 }, { "epoch": 2.0465902295712763, "grad_norm": 1.8067677021026611, "learning_rate": 4.8328168629685425e-06, "loss": 0.9355, "step": 4919 }, { "epoch": 2.047006213763877, "grad_norm": 1.9174190759658813, "learning_rate": 4.8289815406807725e-06, "loss": 0.8169, "step": 4920 }, { "epoch": 2.0474221979564775, "grad_norm": 1.9439277648925781, "learning_rate": 4.825147256380231e-06, "loss": 0.8071, "step": 4921 }, { "epoch": 2.0478381821490785, "grad_norm": 1.7919059991836548, "learning_rate": 4.821314010836586e-06, "loss": 0.6974, "step": 4922 }, { "epoch": 2.048254166341679, "grad_norm": 1.764557957649231, "learning_rate": 4.817481804819287e-06, "loss": 0.9124, "step": 4923 }, { "epoch": 2.0486701505342797, "grad_norm": 1.9454810619354248, "learning_rate": 4.8136506390975845e-06, "loss": 0.8037, "step": 4924 }, { "epoch": 2.0490861347268803, "grad_norm": 1.96073317527771, "learning_rate": 4.809820514440512e-06, "loss": 0.8585, "step": 4925 }, { "epoch": 2.049502118919481, "grad_norm": 1.8418480157852173, "learning_rate": 4.805991431616897e-06, "loss": 0.7575, "step": 4926 }, { "epoch": 2.049918103112082, "grad_norm": 1.9206056594848633, "learning_rate": 4.802163391395351e-06, "loss": 0.8347, "step": 4927 }, { "epoch": 2.0503340873046825, "grad_norm": 1.804085612297058, "learning_rate": 4.7983363945442915e-06, "loss": 0.7252, "step": 4928 }, { "epoch": 2.050750071497283, "grad_norm": 51.6650505065918, "learning_rate": 4.794510441831919e-06, "loss": 0.7935, "step": 4929 }, { "epoch": 2.0511660556898836, "grad_norm": 1.7871228456497192, "learning_rate": 4.790685534026216e-06, "loss": 0.7858, "step": 4930 }, { "epoch": 2.0515820398824847, "grad_norm": 1.8526036739349365, "learning_rate": 4.78686167189497e-06, "loss": 0.7798, "step": 4931 }, { "epoch": 2.0519980240750852, "grad_norm": 1.908791184425354, "learning_rate": 4.783038856205745e-06, "loss": 0.868, "step": 4932 }, { "epoch": 2.052414008267686, "grad_norm": 1.7492172718048096, "learning_rate": 4.779217087725908e-06, "loss": 0.8497, "step": 4933 }, { "epoch": 2.0528299924602864, "grad_norm": 1.9801454544067383, "learning_rate": 4.7753963672226054e-06, "loss": 0.9043, "step": 4934 }, { "epoch": 2.053245976652887, "grad_norm": 1.8427273035049438, "learning_rate": 4.7715766954627765e-06, "loss": 0.8606, "step": 4935 }, { "epoch": 2.053661960845488, "grad_norm": 1.694335699081421, "learning_rate": 4.767758073213147e-06, "loss": 0.783, "step": 4936 }, { "epoch": 2.0540779450380886, "grad_norm": 1.8055343627929688, "learning_rate": 4.763940501240242e-06, "loss": 0.8486, "step": 4937 }, { "epoch": 2.054493929230689, "grad_norm": 1.8910034894943237, "learning_rate": 4.760123980310364e-06, "loss": 0.7872, "step": 4938 }, { "epoch": 2.0549099134232898, "grad_norm": 1.7700109481811523, "learning_rate": 4.756308511189615e-06, "loss": 0.8188, "step": 4939 }, { "epoch": 2.055325897615891, "grad_norm": 1.6288197040557861, "learning_rate": 4.752494094643873e-06, "loss": 0.8021, "step": 4940 }, { "epoch": 2.0557418818084914, "grad_norm": 1.7885332107543945, "learning_rate": 4.7486807314388194e-06, "loss": 0.77, "step": 4941 }, { "epoch": 2.056157866001092, "grad_norm": 1.779032826423645, "learning_rate": 4.744868422339912e-06, "loss": 0.7271, "step": 4942 }, { "epoch": 2.0565738501936925, "grad_norm": 2.0400052070617676, "learning_rate": 4.741057168112409e-06, "loss": 0.9221, "step": 4943 }, { "epoch": 2.056989834386293, "grad_norm": 6.320093631744385, "learning_rate": 4.737246969521343e-06, "loss": 0.7978, "step": 4944 }, { "epoch": 2.057405818578894, "grad_norm": 1.808891773223877, "learning_rate": 4.733437827331541e-06, "loss": 0.7473, "step": 4945 }, { "epoch": 2.0578218027714947, "grad_norm": 1.6560642719268799, "learning_rate": 4.729629742307625e-06, "loss": 0.752, "step": 4946 }, { "epoch": 2.0582377869640953, "grad_norm": 1.9215320348739624, "learning_rate": 4.72582271521399e-06, "loss": 0.9068, "step": 4947 }, { "epoch": 2.058653771156696, "grad_norm": 1.795438528060913, "learning_rate": 4.722016746814836e-06, "loss": 0.7988, "step": 4948 }, { "epoch": 2.059069755349297, "grad_norm": 1.9849523305892944, "learning_rate": 4.7182118378741325e-06, "loss": 0.8244, "step": 4949 }, { "epoch": 2.0594857395418975, "grad_norm": 2.14764404296875, "learning_rate": 4.714407989155655e-06, "loss": 0.8017, "step": 4950 }, { "epoch": 2.059901723734498, "grad_norm": 1.8035629987716675, "learning_rate": 4.7106052014229455e-06, "loss": 0.8331, "step": 4951 }, { "epoch": 2.0603177079270987, "grad_norm": 1.9199823141098022, "learning_rate": 4.706803475439355e-06, "loss": 0.8404, "step": 4952 }, { "epoch": 2.0607336921196993, "grad_norm": 1.936385989189148, "learning_rate": 4.7030028119680035e-06, "loss": 0.7823, "step": 4953 }, { "epoch": 2.0611496763123003, "grad_norm": 1.856269359588623, "learning_rate": 4.699203211771807e-06, "loss": 0.8869, "step": 4954 }, { "epoch": 2.061565660504901, "grad_norm": 1.8242759704589844, "learning_rate": 4.695404675613461e-06, "loss": 0.7667, "step": 4955 }, { "epoch": 2.0619816446975014, "grad_norm": 1.8780790567398071, "learning_rate": 4.691607204255459e-06, "loss": 0.8007, "step": 4956 }, { "epoch": 2.062397628890102, "grad_norm": 1.8306574821472168, "learning_rate": 4.687810798460067e-06, "loss": 0.754, "step": 4957 }, { "epoch": 2.062813613082703, "grad_norm": 2.0762827396392822, "learning_rate": 4.684015458989346e-06, "loss": 0.764, "step": 4958 }, { "epoch": 2.0632295972753036, "grad_norm": 1.773998498916626, "learning_rate": 4.6802211866051475e-06, "loss": 0.7551, "step": 4959 }, { "epoch": 2.063645581467904, "grad_norm": 2.0061023235321045, "learning_rate": 4.676427982069094e-06, "loss": 0.8707, "step": 4960 }, { "epoch": 2.064061565660505, "grad_norm": 1.9967437982559204, "learning_rate": 4.672635846142607e-06, "loss": 0.8943, "step": 4961 }, { "epoch": 2.0644775498531054, "grad_norm": 1.9161067008972168, "learning_rate": 4.668844779586886e-06, "loss": 0.8201, "step": 4962 }, { "epoch": 2.0648935340457064, "grad_norm": 178.76148986816406, "learning_rate": 4.665054783162917e-06, "loss": 0.769, "step": 4963 }, { "epoch": 2.065309518238307, "grad_norm": 1.7465217113494873, "learning_rate": 4.661265857631475e-06, "loss": 0.8816, "step": 4964 }, { "epoch": 2.0657255024309076, "grad_norm": 1.677586555480957, "learning_rate": 4.657478003753117e-06, "loss": 0.7833, "step": 4965 }, { "epoch": 2.066141486623508, "grad_norm": 1.9064850807189941, "learning_rate": 4.653691222288181e-06, "loss": 0.7888, "step": 4966 }, { "epoch": 2.066557470816109, "grad_norm": 1.8471615314483643, "learning_rate": 4.6499055139967995e-06, "loss": 0.6959, "step": 4967 }, { "epoch": 2.0669734550087098, "grad_norm": 1.6653889417648315, "learning_rate": 4.64612087963888e-06, "loss": 0.7567, "step": 4968 }, { "epoch": 2.0673894392013104, "grad_norm": 1.8261184692382812, "learning_rate": 4.642337319974124e-06, "loss": 0.7985, "step": 4969 }, { "epoch": 2.067805423393911, "grad_norm": 2.00927472114563, "learning_rate": 4.638554835762003e-06, "loss": 0.8948, "step": 4970 }, { "epoch": 2.0682214075865115, "grad_norm": 2.06836199760437, "learning_rate": 4.634773427761792e-06, "loss": 0.859, "step": 4971 }, { "epoch": 2.0686373917791125, "grad_norm": 1.842974066734314, "learning_rate": 4.63099309673253e-06, "loss": 0.8892, "step": 4972 }, { "epoch": 2.069053375971713, "grad_norm": 1.7665369510650635, "learning_rate": 4.627213843433057e-06, "loss": 0.7218, "step": 4973 }, { "epoch": 2.0694693601643137, "grad_norm": 1.7517274618148804, "learning_rate": 4.623435668621985e-06, "loss": 0.7662, "step": 4974 }, { "epoch": 2.0698853443569143, "grad_norm": 1.8104051351547241, "learning_rate": 4.61965857305771e-06, "loss": 0.8068, "step": 4975 }, { "epoch": 2.0703013285495153, "grad_norm": 1.8750758171081543, "learning_rate": 4.615882557498422e-06, "loss": 0.6973, "step": 4976 }, { "epoch": 2.070717312742116, "grad_norm": 1.9295477867126465, "learning_rate": 4.612107622702079e-06, "loss": 0.8319, "step": 4977 }, { "epoch": 2.0711332969347165, "grad_norm": 1.7910254001617432, "learning_rate": 4.6083337694264375e-06, "loss": 0.7636, "step": 4978 }, { "epoch": 2.071549281127317, "grad_norm": 1.7487869262695312, "learning_rate": 4.604560998429023e-06, "loss": 0.845, "step": 4979 }, { "epoch": 2.0719652653199176, "grad_norm": 1.809360384941101, "learning_rate": 4.600789310467158e-06, "loss": 0.8438, "step": 4980 }, { "epoch": 2.0723812495125187, "grad_norm": 1.8254700899124146, "learning_rate": 4.5970187062979295e-06, "loss": 0.7354, "step": 4981 }, { "epoch": 2.0727972337051193, "grad_norm": 4.366148948669434, "learning_rate": 4.5932491866782284e-06, "loss": 0.8433, "step": 4982 }, { "epoch": 2.07321321789772, "grad_norm": 1.8298795223236084, "learning_rate": 4.58948075236471e-06, "loss": 0.8152, "step": 4983 }, { "epoch": 2.0736292020903204, "grad_norm": 1.8739724159240723, "learning_rate": 4.58571340411382e-06, "loss": 0.7787, "step": 4984 }, { "epoch": 2.0740451862829214, "grad_norm": 1.9054057598114014, "learning_rate": 4.5819471426817805e-06, "loss": 0.7911, "step": 4985 }, { "epoch": 2.074461170475522, "grad_norm": 85.04871368408203, "learning_rate": 4.578181968824606e-06, "loss": 0.7932, "step": 4986 }, { "epoch": 2.0748771546681226, "grad_norm": 2.1544854640960693, "learning_rate": 4.5744178832980814e-06, "loss": 0.9349, "step": 4987 }, { "epoch": 2.075293138860723, "grad_norm": 1.8016419410705566, "learning_rate": 4.570654886857779e-06, "loss": 0.7148, "step": 4988 }, { "epoch": 2.075709123053324, "grad_norm": 1.9001600742340088, "learning_rate": 4.566892980259055e-06, "loss": 0.8474, "step": 4989 }, { "epoch": 2.076125107245925, "grad_norm": 1.657288908958435, "learning_rate": 4.563132164257037e-06, "loss": 0.7195, "step": 4990 }, { "epoch": 2.0765410914385254, "grad_norm": 1.8500107526779175, "learning_rate": 4.559372439606647e-06, "loss": 0.7999, "step": 4991 }, { "epoch": 2.076957075631126, "grad_norm": 1.7640396356582642, "learning_rate": 4.555613807062578e-06, "loss": 0.8357, "step": 4992 }, { "epoch": 2.0773730598237266, "grad_norm": 1.9789553880691528, "learning_rate": 4.551856267379305e-06, "loss": 0.7413, "step": 4993 }, { "epoch": 2.0777890440163276, "grad_norm": 2.0592546463012695, "learning_rate": 4.548099821311084e-06, "loss": 0.7507, "step": 4994 }, { "epoch": 2.078205028208928, "grad_norm": 1.842075228691101, "learning_rate": 4.544344469611957e-06, "loss": 0.7192, "step": 4995 }, { "epoch": 2.0786210124015287, "grad_norm": 1.932060956954956, "learning_rate": 4.540590213035737e-06, "loss": 0.8838, "step": 4996 }, { "epoch": 2.0790369965941293, "grad_norm": 1.9019811153411865, "learning_rate": 4.536837052336029e-06, "loss": 0.7665, "step": 4997 }, { "epoch": 2.07945298078673, "grad_norm": 1.9650578498840332, "learning_rate": 4.533084988266203e-06, "loss": 0.8046, "step": 4998 }, { "epoch": 2.079868964979331, "grad_norm": 1.82059907913208, "learning_rate": 4.529334021579426e-06, "loss": 0.8402, "step": 4999 }, { "epoch": 2.0802849491719315, "grad_norm": 1.831721305847168, "learning_rate": 4.525584153028626e-06, "loss": 0.8661, "step": 5000 }, { "epoch": 2.0802849491719315, "eval_loss": 0.7588615417480469, "eval_runtime": 2160.6176, "eval_samples_per_second": 3.051, "eval_steps_per_second": 1.525, "step": 5000 }, { "epoch": 2.0802849491719315, "grad_norm": 1.894078016281128, "learning_rate": 4.521835383366532e-06, "loss": 0.8434, "step": 5001 }, { "epoch": 2.080700933364532, "grad_norm": 1.9454823732376099, "learning_rate": 4.518087713345629e-06, "loss": 0.8822, "step": 5002 }, { "epoch": 2.0811169175571327, "grad_norm": 4.073822498321533, "learning_rate": 4.514341143718203e-06, "loss": 0.7135, "step": 5003 }, { "epoch": 2.0815329017497337, "grad_norm": 1.7793787717819214, "learning_rate": 4.510595675236305e-06, "loss": 0.718, "step": 5004 }, { "epoch": 2.0819488859423343, "grad_norm": 1.8068472146987915, "learning_rate": 4.506851308651766e-06, "loss": 0.7418, "step": 5005 }, { "epoch": 2.082364870134935, "grad_norm": 1.9212234020233154, "learning_rate": 4.503108044716205e-06, "loss": 0.8893, "step": 5006 }, { "epoch": 2.0827808543275355, "grad_norm": 1.9705758094787598, "learning_rate": 4.499365884181008e-06, "loss": 0.7822, "step": 5007 }, { "epoch": 2.083196838520136, "grad_norm": 1.7853929996490479, "learning_rate": 4.495624827797352e-06, "loss": 0.7746, "step": 5008 }, { "epoch": 2.083612822712737, "grad_norm": 1.8122934103012085, "learning_rate": 4.491884876316177e-06, "loss": 0.7727, "step": 5009 }, { "epoch": 2.0840288069053376, "grad_norm": 1.8586748838424683, "learning_rate": 4.4881460304882185e-06, "loss": 0.831, "step": 5010 }, { "epoch": 2.0844447910979382, "grad_norm": 1.7463696002960205, "learning_rate": 4.484408291063973e-06, "loss": 0.78, "step": 5011 }, { "epoch": 2.084860775290539, "grad_norm": 2.6781153678894043, "learning_rate": 4.480671658793731e-06, "loss": 0.733, "step": 5012 }, { "epoch": 2.08527675948314, "grad_norm": 2.335439682006836, "learning_rate": 4.4769361344275495e-06, "loss": 0.8119, "step": 5013 }, { "epoch": 2.0856927436757404, "grad_norm": 1.754643440246582, "learning_rate": 4.473201718715268e-06, "loss": 0.8257, "step": 5014 }, { "epoch": 2.086108727868341, "grad_norm": 1.802445888519287, "learning_rate": 4.469468412406495e-06, "loss": 0.8193, "step": 5015 }, { "epoch": 2.0865247120609416, "grad_norm": 1.8837392330169678, "learning_rate": 4.465736216250632e-06, "loss": 0.7805, "step": 5016 }, { "epoch": 2.086940696253542, "grad_norm": 1.660852074623108, "learning_rate": 4.462005130996843e-06, "loss": 0.6433, "step": 5017 }, { "epoch": 2.087356680446143, "grad_norm": 1.932102084159851, "learning_rate": 4.4582751573940776e-06, "loss": 0.8142, "step": 5018 }, { "epoch": 2.087772664638744, "grad_norm": 1.8134044408798218, "learning_rate": 4.454546296191063e-06, "loss": 0.7687, "step": 5019 }, { "epoch": 2.0881886488313444, "grad_norm": 1.7391412258148193, "learning_rate": 4.450818548136295e-06, "loss": 0.8876, "step": 5020 }, { "epoch": 2.088604633023945, "grad_norm": 1.8051446676254272, "learning_rate": 4.4470919139780545e-06, "loss": 0.7274, "step": 5021 }, { "epoch": 2.089020617216546, "grad_norm": 1.9296422004699707, "learning_rate": 4.443366394464394e-06, "loss": 0.7292, "step": 5022 }, { "epoch": 2.0894366014091466, "grad_norm": 1.9714815616607666, "learning_rate": 4.4396419903431405e-06, "loss": 0.7871, "step": 5023 }, { "epoch": 2.089852585601747, "grad_norm": 8.46558952331543, "learning_rate": 4.4359187023618995e-06, "loss": 0.7931, "step": 5024 }, { "epoch": 2.0902685697943477, "grad_norm": 2.0120298862457275, "learning_rate": 4.4321965312680596e-06, "loss": 0.8393, "step": 5025 }, { "epoch": 2.0906845539869483, "grad_norm": 1.781286358833313, "learning_rate": 4.428475477808768e-06, "loss": 0.8043, "step": 5026 }, { "epoch": 2.0911005381795493, "grad_norm": 1.9219900369644165, "learning_rate": 4.4247555427309685e-06, "loss": 0.7903, "step": 5027 }, { "epoch": 2.09151652237215, "grad_norm": 2.0185110569000244, "learning_rate": 4.421036726781362e-06, "loss": 0.8199, "step": 5028 }, { "epoch": 2.0919325065647505, "grad_norm": 1.7881356477737427, "learning_rate": 4.41731903070644e-06, "loss": 0.8354, "step": 5029 }, { "epoch": 2.092348490757351, "grad_norm": 2.057147264480591, "learning_rate": 4.413602455252453e-06, "loss": 0.7644, "step": 5030 }, { "epoch": 2.092764474949952, "grad_norm": 1.9302979707717896, "learning_rate": 4.409887001165445e-06, "loss": 0.8076, "step": 5031 }, { "epoch": 2.0931804591425527, "grad_norm": 200.54983520507812, "learning_rate": 4.4061726691912195e-06, "loss": 0.8107, "step": 5032 }, { "epoch": 2.0935964433351533, "grad_norm": 1.8127899169921875, "learning_rate": 4.402459460075364e-06, "loss": 0.7627, "step": 5033 }, { "epoch": 2.094012427527754, "grad_norm": 1.6828124523162842, "learning_rate": 4.3987473745632295e-06, "loss": 0.7512, "step": 5034 }, { "epoch": 2.0944284117203544, "grad_norm": 1.931667685508728, "learning_rate": 4.395036413399955e-06, "loss": 0.8128, "step": 5035 }, { "epoch": 2.0948443959129555, "grad_norm": 25.1763916015625, "learning_rate": 4.391326577330451e-06, "loss": 0.8051, "step": 5036 }, { "epoch": 2.095260380105556, "grad_norm": 1.9229930639266968, "learning_rate": 4.387617867099393e-06, "loss": 0.8366, "step": 5037 }, { "epoch": 2.0956763642981566, "grad_norm": 1.8567529916763306, "learning_rate": 4.383910283451242e-06, "loss": 0.8991, "step": 5038 }, { "epoch": 2.096092348490757, "grad_norm": 2.153883934020996, "learning_rate": 4.3802038271302224e-06, "loss": 0.9149, "step": 5039 }, { "epoch": 2.0965083326833582, "grad_norm": 188.21693420410156, "learning_rate": 4.376498498880344e-06, "loss": 0.8255, "step": 5040 }, { "epoch": 2.096924316875959, "grad_norm": 1.814893364906311, "learning_rate": 4.372794299445379e-06, "loss": 0.7863, "step": 5041 }, { "epoch": 2.0973403010685594, "grad_norm": 1.8846505880355835, "learning_rate": 4.369091229568874e-06, "loss": 0.8378, "step": 5042 }, { "epoch": 2.09775628526116, "grad_norm": 2.1024045944213867, "learning_rate": 4.365389289994159e-06, "loss": 0.7903, "step": 5043 }, { "epoch": 2.0981722694537606, "grad_norm": 1.8105390071868896, "learning_rate": 4.36168848146433e-06, "loss": 0.8083, "step": 5044 }, { "epoch": 2.0985882536463616, "grad_norm": 2.054717779159546, "learning_rate": 4.357988804722251e-06, "loss": 0.7725, "step": 5045 }, { "epoch": 2.099004237838962, "grad_norm": 2.0094926357269287, "learning_rate": 4.354290260510571e-06, "loss": 0.858, "step": 5046 }, { "epoch": 2.0994202220315628, "grad_norm": 1.9250659942626953, "learning_rate": 4.350592849571697e-06, "loss": 0.8307, "step": 5047 }, { "epoch": 2.0998362062241633, "grad_norm": 1.6291232109069824, "learning_rate": 4.346896572647827e-06, "loss": 0.6806, "step": 5048 }, { "epoch": 2.1002521904167644, "grad_norm": 2.000936508178711, "learning_rate": 4.343201430480911e-06, "loss": 0.9145, "step": 5049 }, { "epoch": 2.100668174609365, "grad_norm": 1.9010308980941772, "learning_rate": 4.339507423812686e-06, "loss": 0.7161, "step": 5050 }, { "epoch": 2.1010841588019655, "grad_norm": 1.8490790128707886, "learning_rate": 4.335814553384659e-06, "loss": 0.898, "step": 5051 }, { "epoch": 2.101500142994566, "grad_norm": 1.7349729537963867, "learning_rate": 4.332122819938103e-06, "loss": 0.6666, "step": 5052 }, { "epoch": 2.1019161271871667, "grad_norm": 1.8651318550109863, "learning_rate": 4.328432224214064e-06, "loss": 0.946, "step": 5053 }, { "epoch": 2.1023321113797677, "grad_norm": 1.94442880153656, "learning_rate": 4.324742766953362e-06, "loss": 0.8019, "step": 5054 }, { "epoch": 2.1027480955723683, "grad_norm": 1.735630989074707, "learning_rate": 4.32105444889659e-06, "loss": 0.8426, "step": 5055 }, { "epoch": 2.103164079764969, "grad_norm": 710.2088623046875, "learning_rate": 4.317367270784107e-06, "loss": 0.7794, "step": 5056 }, { "epoch": 2.1035800639575695, "grad_norm": 1.8936560153961182, "learning_rate": 4.313681233356053e-06, "loss": 0.8046, "step": 5057 }, { "epoch": 2.1039960481501705, "grad_norm": 1.7091258764266968, "learning_rate": 4.309996337352323e-06, "loss": 0.7354, "step": 5058 }, { "epoch": 2.104412032342771, "grad_norm": 52.589698791503906, "learning_rate": 4.306312583512603e-06, "loss": 0.7981, "step": 5059 }, { "epoch": 2.1048280165353717, "grad_norm": 1.9433382749557495, "learning_rate": 4.302629972576329e-06, "loss": 0.8375, "step": 5060 }, { "epoch": 2.1052440007279722, "grad_norm": 1.7990106344223022, "learning_rate": 4.298948505282727e-06, "loss": 0.748, "step": 5061 }, { "epoch": 2.105659984920573, "grad_norm": 1.826115369796753, "learning_rate": 4.295268182370779e-06, "loss": 0.8136, "step": 5062 }, { "epoch": 2.106075969113174, "grad_norm": 1.6706900596618652, "learning_rate": 4.291589004579242e-06, "loss": 0.6443, "step": 5063 }, { "epoch": 2.1064919533057744, "grad_norm": 1.7182568311691284, "learning_rate": 4.2879109726466415e-06, "loss": 0.7654, "step": 5064 }, { "epoch": 2.106907937498375, "grad_norm": 1.9131877422332764, "learning_rate": 4.284234087311279e-06, "loss": 0.7391, "step": 5065 }, { "epoch": 2.1073239216909756, "grad_norm": 2.0994136333465576, "learning_rate": 4.280558349311223e-06, "loss": 0.8012, "step": 5066 }, { "epoch": 2.1077399058835766, "grad_norm": 1.860122561454773, "learning_rate": 4.276883759384306e-06, "loss": 0.7487, "step": 5067 }, { "epoch": 2.108155890076177, "grad_norm": 1.8769432306289673, "learning_rate": 4.2732103182681405e-06, "loss": 0.9335, "step": 5068 }, { "epoch": 2.108571874268778, "grad_norm": 1.8276311159133911, "learning_rate": 4.269538026700095e-06, "loss": 0.8352, "step": 5069 }, { "epoch": 2.1089878584613784, "grad_norm": 1.9279171228408813, "learning_rate": 4.265866885417323e-06, "loss": 0.8493, "step": 5070 }, { "epoch": 2.109403842653979, "grad_norm": 1.9935239553451538, "learning_rate": 4.262196895156735e-06, "loss": 0.8108, "step": 5071 }, { "epoch": 2.10981982684658, "grad_norm": 1.8287417888641357, "learning_rate": 4.258528056655013e-06, "loss": 0.6964, "step": 5072 }, { "epoch": 2.1102358110391806, "grad_norm": 54.39466857910156, "learning_rate": 4.2548603706486084e-06, "loss": 0.7551, "step": 5073 }, { "epoch": 2.110651795231781, "grad_norm": 1.7955002784729004, "learning_rate": 4.251193837873746e-06, "loss": 0.8128, "step": 5074 }, { "epoch": 2.1110677794243817, "grad_norm": 1.7993510961532593, "learning_rate": 4.247528459066409e-06, "loss": 0.7603, "step": 5075 }, { "epoch": 2.1114837636169828, "grad_norm": 1.8499023914337158, "learning_rate": 4.243864234962364e-06, "loss": 0.7876, "step": 5076 }, { "epoch": 2.1118997478095833, "grad_norm": 1.840472936630249, "learning_rate": 4.240201166297126e-06, "loss": 0.8246, "step": 5077 }, { "epoch": 2.112315732002184, "grad_norm": 1.8088995218276978, "learning_rate": 4.236539253806e-06, "loss": 0.7516, "step": 5078 }, { "epoch": 2.1127317161947845, "grad_norm": 1.845316767692566, "learning_rate": 4.232878498224039e-06, "loss": 0.8785, "step": 5079 }, { "epoch": 2.113147700387385, "grad_norm": 1.7848037481307983, "learning_rate": 4.229218900286078e-06, "loss": 0.8212, "step": 5080 }, { "epoch": 2.113563684579986, "grad_norm": 1.8152996301651, "learning_rate": 4.2255604607267144e-06, "loss": 0.7724, "step": 5081 }, { "epoch": 2.1139796687725867, "grad_norm": 1.8771520853042603, "learning_rate": 4.221903180280306e-06, "loss": 0.7839, "step": 5082 }, { "epoch": 2.1143956529651873, "grad_norm": 1.8195772171020508, "learning_rate": 4.218247059680994e-06, "loss": 0.6934, "step": 5083 }, { "epoch": 2.114811637157788, "grad_norm": 2.450012445449829, "learning_rate": 4.214592099662671e-06, "loss": 0.7985, "step": 5084 }, { "epoch": 2.115227621350389, "grad_norm": 1.9272167682647705, "learning_rate": 4.2109383009590075e-06, "loss": 0.8435, "step": 5085 }, { "epoch": 2.1156436055429895, "grad_norm": 2.074758291244507, "learning_rate": 4.207285664303432e-06, "loss": 0.7802, "step": 5086 }, { "epoch": 2.11605958973559, "grad_norm": 2.973963737487793, "learning_rate": 4.203634190429151e-06, "loss": 0.771, "step": 5087 }, { "epoch": 2.1164755739281906, "grad_norm": 1.7752830982208252, "learning_rate": 4.199983880069124e-06, "loss": 0.7929, "step": 5088 }, { "epoch": 2.116891558120791, "grad_norm": 1.9166699647903442, "learning_rate": 4.19633473395609e-06, "loss": 0.7162, "step": 5089 }, { "epoch": 2.1173075423133922, "grad_norm": 1.9367256164550781, "learning_rate": 4.192686752822543e-06, "loss": 0.7478, "step": 5090 }, { "epoch": 2.117723526505993, "grad_norm": 1.7046153545379639, "learning_rate": 4.189039937400753e-06, "loss": 0.6977, "step": 5091 }, { "epoch": 2.1181395106985934, "grad_norm": 1.9997949600219727, "learning_rate": 4.185394288422749e-06, "loss": 0.8056, "step": 5092 }, { "epoch": 2.118555494891194, "grad_norm": 1.8246110677719116, "learning_rate": 4.18174980662033e-06, "loss": 0.7755, "step": 5093 }, { "epoch": 2.118971479083795, "grad_norm": 1.918913722038269, "learning_rate": 4.178106492725053e-06, "loss": 0.7659, "step": 5094 }, { "epoch": 2.1193874632763956, "grad_norm": 1.9636502265930176, "learning_rate": 4.174464347468251e-06, "loss": 0.7652, "step": 5095 }, { "epoch": 2.119803447468996, "grad_norm": 1.8266685009002686, "learning_rate": 4.170823371581022e-06, "loss": 0.7483, "step": 5096 }, { "epoch": 2.1202194316615968, "grad_norm": 1.9878679513931274, "learning_rate": 4.167183565794217e-06, "loss": 0.8527, "step": 5097 }, { "epoch": 2.1206354158541973, "grad_norm": 1.9876009225845337, "learning_rate": 4.163544930838468e-06, "loss": 0.7047, "step": 5098 }, { "epoch": 2.1210514000467984, "grad_norm": 1.9613198041915894, "learning_rate": 4.159907467444158e-06, "loss": 0.9577, "step": 5099 }, { "epoch": 2.121467384239399, "grad_norm": 2.0205588340759277, "learning_rate": 4.1562711763414466e-06, "loss": 0.7899, "step": 5100 }, { "epoch": 2.1218833684319995, "grad_norm": 1.7096972465515137, "learning_rate": 4.15263605826025e-06, "loss": 0.7777, "step": 5101 }, { "epoch": 2.1222993526246, "grad_norm": 10.250446319580078, "learning_rate": 4.149002113930251e-06, "loss": 0.8036, "step": 5102 }, { "epoch": 2.122715336817201, "grad_norm": 1.8540146350860596, "learning_rate": 4.145369344080896e-06, "loss": 0.7565, "step": 5103 }, { "epoch": 2.1231313210098017, "grad_norm": 1.7656886577606201, "learning_rate": 4.141737749441402e-06, "loss": 0.8518, "step": 5104 }, { "epoch": 2.1235473052024023, "grad_norm": 1.772105097770691, "learning_rate": 4.1381073307407375e-06, "loss": 0.8979, "step": 5105 }, { "epoch": 2.123963289395003, "grad_norm": 36.614070892333984, "learning_rate": 4.13447808870765e-06, "loss": 0.8108, "step": 5106 }, { "epoch": 2.1243792735876035, "grad_norm": 1.8096210956573486, "learning_rate": 4.130850024070638e-06, "loss": 0.8005, "step": 5107 }, { "epoch": 2.1247952577802045, "grad_norm": 1.8919752836227417, "learning_rate": 4.127223137557975e-06, "loss": 0.7829, "step": 5108 }, { "epoch": 2.125211241972805, "grad_norm": 1.917353630065918, "learning_rate": 4.123597429897684e-06, "loss": 0.8828, "step": 5109 }, { "epoch": 2.1256272261654057, "grad_norm": 1.931614875793457, "learning_rate": 4.119972901817568e-06, "loss": 0.7734, "step": 5110 }, { "epoch": 2.1260432103580063, "grad_norm": 1.9245749711990356, "learning_rate": 4.116349554045181e-06, "loss": 0.6826, "step": 5111 }, { "epoch": 2.1264591945506073, "grad_norm": 1.8689885139465332, "learning_rate": 4.112727387307839e-06, "loss": 0.8803, "step": 5112 }, { "epoch": 2.126875178743208, "grad_norm": 2.088702917098999, "learning_rate": 4.1091064023326335e-06, "loss": 0.9407, "step": 5113 }, { "epoch": 2.1272911629358084, "grad_norm": 1.72379469871521, "learning_rate": 4.105486599846404e-06, "loss": 0.7686, "step": 5114 }, { "epoch": 2.127707147128409, "grad_norm": 3.8762807846069336, "learning_rate": 4.101867980575765e-06, "loss": 0.7248, "step": 5115 }, { "epoch": 2.1281231313210096, "grad_norm": 1.7709624767303467, "learning_rate": 4.098250545247082e-06, "loss": 0.779, "step": 5116 }, { "epoch": 2.1285391155136106, "grad_norm": 1.892422080039978, "learning_rate": 4.094634294586497e-06, "loss": 0.832, "step": 5117 }, { "epoch": 2.128955099706211, "grad_norm": 1.7961821556091309, "learning_rate": 4.091019229319898e-06, "loss": 0.8588, "step": 5118 }, { "epoch": 2.129371083898812, "grad_norm": 1.7361931800842285, "learning_rate": 4.087405350172951e-06, "loss": 0.761, "step": 5119 }, { "epoch": 2.1297870680914124, "grad_norm": 1.8613386154174805, "learning_rate": 4.083792657871071e-06, "loss": 0.725, "step": 5120 }, { "epoch": 2.1302030522840134, "grad_norm": 1.9339286088943481, "learning_rate": 4.080181153139438e-06, "loss": 0.6607, "step": 5121 }, { "epoch": 2.130619036476614, "grad_norm": 1.8179134130477905, "learning_rate": 4.076570836703e-06, "loss": 0.8893, "step": 5122 }, { "epoch": 2.1310350206692146, "grad_norm": 1.7797014713287354, "learning_rate": 4.072961709286461e-06, "loss": 0.7494, "step": 5123 }, { "epoch": 2.131451004861815, "grad_norm": 9.235833168029785, "learning_rate": 4.069353771614283e-06, "loss": 0.8943, "step": 5124 }, { "epoch": 2.1318669890544157, "grad_norm": 1.7317312955856323, "learning_rate": 4.065747024410696e-06, "loss": 0.8102, "step": 5125 }, { "epoch": 2.1322829732470168, "grad_norm": 1.814066767692566, "learning_rate": 4.062141468399692e-06, "loss": 0.7125, "step": 5126 }, { "epoch": 2.1326989574396173, "grad_norm": 1.8200135231018066, "learning_rate": 4.058537104305015e-06, "loss": 0.7682, "step": 5127 }, { "epoch": 2.133114941632218, "grad_norm": 1.943495750427246, "learning_rate": 4.05493393285018e-06, "loss": 0.8339, "step": 5128 }, { "epoch": 2.1335309258248185, "grad_norm": 1.9392668008804321, "learning_rate": 4.051331954758452e-06, "loss": 0.7896, "step": 5129 }, { "epoch": 2.1339469100174195, "grad_norm": 1.8559105396270752, "learning_rate": 4.047731170752869e-06, "loss": 0.7265, "step": 5130 }, { "epoch": 2.13436289421002, "grad_norm": 1.7445679903030396, "learning_rate": 4.044131581556218e-06, "loss": 0.8247, "step": 5131 }, { "epoch": 2.1347788784026207, "grad_norm": 1.758561611175537, "learning_rate": 4.0405331878910515e-06, "loss": 0.7475, "step": 5132 }, { "epoch": 2.1351948625952213, "grad_norm": 1.9571279287338257, "learning_rate": 4.036935990479677e-06, "loss": 0.8034, "step": 5133 }, { "epoch": 2.135610846787822, "grad_norm": 1.9500726461410522, "learning_rate": 4.033339990044174e-06, "loss": 0.8805, "step": 5134 }, { "epoch": 2.136026830980423, "grad_norm": 1.9098986387252808, "learning_rate": 4.029745187306365e-06, "loss": 0.8441, "step": 5135 }, { "epoch": 2.1364428151730235, "grad_norm": 1.939675211906433, "learning_rate": 4.026151582987849e-06, "loss": 0.9434, "step": 5136 }, { "epoch": 2.136858799365624, "grad_norm": 1.9044522047042847, "learning_rate": 4.02255917780997e-06, "loss": 0.773, "step": 5137 }, { "epoch": 2.1372747835582246, "grad_norm": 1.6926710605621338, "learning_rate": 4.0189679724938425e-06, "loss": 0.7092, "step": 5138 }, { "epoch": 2.1376907677508257, "grad_norm": 1.8681764602661133, "learning_rate": 4.01537796776033e-06, "loss": 0.8755, "step": 5139 }, { "epoch": 2.1381067519434263, "grad_norm": 1.8795115947723389, "learning_rate": 4.011789164330066e-06, "loss": 0.9106, "step": 5140 }, { "epoch": 2.138522736136027, "grad_norm": 8.56825065612793, "learning_rate": 4.008201562923435e-06, "loss": 0.8396, "step": 5141 }, { "epoch": 2.1389387203286274, "grad_norm": 1.8327733278274536, "learning_rate": 4.004615164260577e-06, "loss": 0.8021, "step": 5142 }, { "epoch": 2.139354704521228, "grad_norm": 1.8744086027145386, "learning_rate": 4.001029969061403e-06, "loss": 0.7339, "step": 5143 }, { "epoch": 2.139770688713829, "grad_norm": 1.657740831375122, "learning_rate": 3.99744597804557e-06, "loss": 0.7026, "step": 5144 }, { "epoch": 2.1401866729064296, "grad_norm": 2.0130372047424316, "learning_rate": 3.993863191932504e-06, "loss": 0.8313, "step": 5145 }, { "epoch": 2.14060265709903, "grad_norm": 1.8196885585784912, "learning_rate": 3.990281611441377e-06, "loss": 0.7768, "step": 5146 }, { "epoch": 2.1410186412916308, "grad_norm": 2.1790883541107178, "learning_rate": 3.986701237291132e-06, "loss": 0.8301, "step": 5147 }, { "epoch": 2.141434625484232, "grad_norm": 1.8790643215179443, "learning_rate": 3.983122070200458e-06, "loss": 0.6877, "step": 5148 }, { "epoch": 2.1418506096768324, "grad_norm": 1.7526428699493408, "learning_rate": 3.979544110887814e-06, "loss": 0.8216, "step": 5149 }, { "epoch": 2.142266593869433, "grad_norm": 1.688186526298523, "learning_rate": 3.975967360071406e-06, "loss": 0.7363, "step": 5150 }, { "epoch": 2.1426825780620335, "grad_norm": 1.9024873971939087, "learning_rate": 3.972391818469199e-06, "loss": 0.7587, "step": 5151 }, { "epoch": 2.143098562254634, "grad_norm": 2.140409231185913, "learning_rate": 3.968817486798918e-06, "loss": 0.7938, "step": 5152 }, { "epoch": 2.143514546447235, "grad_norm": 1.7516093254089355, "learning_rate": 3.9652443657780485e-06, "loss": 0.7599, "step": 5153 }, { "epoch": 2.1439305306398357, "grad_norm": 2.9205851554870605, "learning_rate": 3.961672456123824e-06, "loss": 0.758, "step": 5154 }, { "epoch": 2.1443465148324363, "grad_norm": 2.0624818801879883, "learning_rate": 3.958101758553246e-06, "loss": 0.743, "step": 5155 }, { "epoch": 2.144762499025037, "grad_norm": 1.777345895767212, "learning_rate": 3.95453227378306e-06, "loss": 0.698, "step": 5156 }, { "epoch": 2.145178483217638, "grad_norm": 1.938413381576538, "learning_rate": 3.950964002529777e-06, "loss": 0.9905, "step": 5157 }, { "epoch": 2.1455944674102385, "grad_norm": 1.869236946105957, "learning_rate": 3.9473969455096664e-06, "loss": 0.7725, "step": 5158 }, { "epoch": 2.146010451602839, "grad_norm": 1.9583253860473633, "learning_rate": 3.943831103438746e-06, "loss": 0.8236, "step": 5159 }, { "epoch": 2.1464264357954397, "grad_norm": 1.8504104614257812, "learning_rate": 3.940266477032795e-06, "loss": 0.8377, "step": 5160 }, { "epoch": 2.1468424199880403, "grad_norm": 1.9147312641143799, "learning_rate": 3.936703067007341e-06, "loss": 0.7791, "step": 5161 }, { "epoch": 2.1472584041806413, "grad_norm": 2.077589988708496, "learning_rate": 3.933140874077681e-06, "loss": 0.6769, "step": 5162 }, { "epoch": 2.147674388373242, "grad_norm": 1.725881814956665, "learning_rate": 3.929579898958854e-06, "loss": 0.733, "step": 5163 }, { "epoch": 2.1480903725658425, "grad_norm": 2.0878067016601562, "learning_rate": 3.926020142365666e-06, "loss": 0.9103, "step": 5164 }, { "epoch": 2.148506356758443, "grad_norm": 1.8650578260421753, "learning_rate": 3.922461605012666e-06, "loss": 0.8182, "step": 5165 }, { "epoch": 2.148922340951044, "grad_norm": 1.780759334564209, "learning_rate": 3.9189042876141724e-06, "loss": 0.7624, "step": 5166 }, { "epoch": 2.1493383251436446, "grad_norm": 1.7255797386169434, "learning_rate": 3.915348190884246e-06, "loss": 0.7799, "step": 5167 }, { "epoch": 2.1497543093362452, "grad_norm": 1.986055850982666, "learning_rate": 3.911793315536714e-06, "loss": 0.7896, "step": 5168 }, { "epoch": 2.150170293528846, "grad_norm": 1.783389687538147, "learning_rate": 3.908239662285144e-06, "loss": 0.8145, "step": 5169 }, { "epoch": 2.1505862777214464, "grad_norm": 1.982088565826416, "learning_rate": 3.904687231842877e-06, "loss": 0.7805, "step": 5170 }, { "epoch": 2.1510022619140474, "grad_norm": 1.855636477470398, "learning_rate": 3.9011360249229915e-06, "loss": 0.8471, "step": 5171 }, { "epoch": 2.151418246106648, "grad_norm": 1.814117670059204, "learning_rate": 3.897586042238326e-06, "loss": 0.8174, "step": 5172 }, { "epoch": 2.1518342302992486, "grad_norm": 1.8129929304122925, "learning_rate": 3.8940372845014796e-06, "loss": 0.7538, "step": 5173 }, { "epoch": 2.152250214491849, "grad_norm": 1.7587685585021973, "learning_rate": 3.890489752424797e-06, "loss": 0.8257, "step": 5174 }, { "epoch": 2.15266619868445, "grad_norm": 26.319948196411133, "learning_rate": 3.886943446720382e-06, "loss": 0.7931, "step": 5175 }, { "epoch": 2.1530821828770508, "grad_norm": 1.8143326044082642, "learning_rate": 3.883398368100089e-06, "loss": 0.7116, "step": 5176 }, { "epoch": 2.1534981670696514, "grad_norm": 1.9083313941955566, "learning_rate": 3.87985451727553e-06, "loss": 0.8458, "step": 5177 }, { "epoch": 2.153914151262252, "grad_norm": 1.8416543006896973, "learning_rate": 3.8763118949580635e-06, "loss": 0.7333, "step": 5178 }, { "epoch": 2.1543301354548525, "grad_norm": 1.749293565750122, "learning_rate": 3.8727705018588135e-06, "loss": 0.8005, "step": 5179 }, { "epoch": 2.1547461196474536, "grad_norm": 1.7966147661209106, "learning_rate": 3.8692303386886444e-06, "loss": 0.7784, "step": 5180 }, { "epoch": 2.155162103840054, "grad_norm": 1.6724828481674194, "learning_rate": 3.8656914061581805e-06, "loss": 0.7594, "step": 5181 }, { "epoch": 2.1555780880326547, "grad_norm": 1.703696370124817, "learning_rate": 3.862153704977794e-06, "loss": 0.7591, "step": 5182 }, { "epoch": 2.1559940722252553, "grad_norm": 1.7709914445877075, "learning_rate": 3.85861723585762e-06, "loss": 0.8647, "step": 5183 }, { "epoch": 2.1564100564178563, "grad_norm": 1.8331094980239868, "learning_rate": 3.855081999507534e-06, "loss": 0.7668, "step": 5184 }, { "epoch": 2.156826040610457, "grad_norm": 1.804863691329956, "learning_rate": 3.851547996637176e-06, "loss": 0.8707, "step": 5185 }, { "epoch": 2.1572420248030575, "grad_norm": 1.8653391599655151, "learning_rate": 3.8480152279559255e-06, "loss": 0.7582, "step": 5186 }, { "epoch": 2.157658008995658, "grad_norm": 1.7672803401947021, "learning_rate": 3.844483694172925e-06, "loss": 0.8764, "step": 5187 }, { "epoch": 2.1580739931882587, "grad_norm": 1.8026220798492432, "learning_rate": 3.840953395997068e-06, "loss": 0.9089, "step": 5188 }, { "epoch": 2.1584899773808597, "grad_norm": 1.8810008764266968, "learning_rate": 3.837424334136994e-06, "loss": 0.826, "step": 5189 }, { "epoch": 2.1589059615734603, "grad_norm": 2.820917844772339, "learning_rate": 3.833896509301097e-06, "loss": 0.7535, "step": 5190 }, { "epoch": 2.159321945766061, "grad_norm": 1.9464613199234009, "learning_rate": 3.830369922197522e-06, "loss": 0.9213, "step": 5191 }, { "epoch": 2.1597379299586614, "grad_norm": 1.9269062280654907, "learning_rate": 3.8268445735341705e-06, "loss": 0.7976, "step": 5192 }, { "epoch": 2.1601539141512625, "grad_norm": 1.7822346687316895, "learning_rate": 3.823320464018687e-06, "loss": 0.7631, "step": 5193 }, { "epoch": 2.160569898343863, "grad_norm": 1.8196943998336792, "learning_rate": 3.8197975943584785e-06, "loss": 0.8761, "step": 5194 }, { "epoch": 2.1609858825364636, "grad_norm": 1.8513599634170532, "learning_rate": 3.816275965260688e-06, "loss": 0.715, "step": 5195 }, { "epoch": 2.161401866729064, "grad_norm": 1.8034054040908813, "learning_rate": 3.8127555774322276e-06, "loss": 0.7479, "step": 5196 }, { "epoch": 2.161817850921665, "grad_norm": 1.756510853767395, "learning_rate": 3.8092364315797424e-06, "loss": 0.7713, "step": 5197 }, { "epoch": 2.162233835114266, "grad_norm": 2.414656639099121, "learning_rate": 3.805718528409643e-06, "loss": 0.7504, "step": 5198 }, { "epoch": 2.1626498193068664, "grad_norm": 1.811490535736084, "learning_rate": 3.8022018686280802e-06, "loss": 0.7994, "step": 5199 }, { "epoch": 2.163065803499467, "grad_norm": 1.9351102113723755, "learning_rate": 3.798686452940957e-06, "loss": 0.8702, "step": 5200 }, { "epoch": 2.1634817876920676, "grad_norm": 1.8632456064224243, "learning_rate": 3.795172282053935e-06, "loss": 0.7421, "step": 5201 }, { "epoch": 2.1638977718846686, "grad_norm": 1.7915116548538208, "learning_rate": 3.7916593566724124e-06, "loss": 0.7356, "step": 5202 }, { "epoch": 2.164313756077269, "grad_norm": 2.1657755374908447, "learning_rate": 3.7881476775015513e-06, "loss": 0.8433, "step": 5203 }, { "epoch": 2.1647297402698698, "grad_norm": 1.7881276607513428, "learning_rate": 3.784637245246251e-06, "loss": 0.8537, "step": 5204 }, { "epoch": 2.1651457244624703, "grad_norm": 1.8066843748092651, "learning_rate": 3.7811280606111735e-06, "loss": 0.7884, "step": 5205 }, { "epoch": 2.165561708655071, "grad_norm": 71.2354736328125, "learning_rate": 3.7776201243007147e-06, "loss": 0.8497, "step": 5206 }, { "epoch": 2.165977692847672, "grad_norm": 32.97038650512695, "learning_rate": 3.774113437019037e-06, "loss": 0.7213, "step": 5207 }, { "epoch": 2.1663936770402725, "grad_norm": 1.9848618507385254, "learning_rate": 3.7706079994700362e-06, "loss": 0.7914, "step": 5208 }, { "epoch": 2.166809661232873, "grad_norm": 1.887885570526123, "learning_rate": 3.7671038123573723e-06, "loss": 0.7719, "step": 5209 }, { "epoch": 2.1672256454254737, "grad_norm": 1.7967970371246338, "learning_rate": 3.7636008763844425e-06, "loss": 0.7856, "step": 5210 }, { "epoch": 2.1676416296180747, "grad_norm": 1.784999132156372, "learning_rate": 3.7600991922543973e-06, "loss": 0.7501, "step": 5211 }, { "epoch": 2.1680576138106753, "grad_norm": 1.9869383573532104, "learning_rate": 3.7565987606701317e-06, "loss": 0.7743, "step": 5212 }, { "epoch": 2.168473598003276, "grad_norm": 1.7222195863723755, "learning_rate": 3.7530995823343e-06, "loss": 0.6579, "step": 5213 }, { "epoch": 2.1688895821958765, "grad_norm": 1.7829155921936035, "learning_rate": 3.7496016579492923e-06, "loss": 0.7598, "step": 5214 }, { "epoch": 2.169305566388477, "grad_norm": 1.8722178936004639, "learning_rate": 3.7461049882172583e-06, "loss": 0.8968, "step": 5215 }, { "epoch": 2.169721550581078, "grad_norm": 1.8277838230133057, "learning_rate": 3.7426095738400846e-06, "loss": 0.7433, "step": 5216 }, { "epoch": 2.1701375347736787, "grad_norm": 4.432876110076904, "learning_rate": 3.7391154155194155e-06, "loss": 0.7384, "step": 5217 }, { "epoch": 2.1705535189662792, "grad_norm": 2.0053656101226807, "learning_rate": 3.7356225139566417e-06, "loss": 0.8573, "step": 5218 }, { "epoch": 2.17096950315888, "grad_norm": 1.7346596717834473, "learning_rate": 3.732130869852897e-06, "loss": 0.871, "step": 5219 }, { "epoch": 2.171385487351481, "grad_norm": 1.86215341091156, "learning_rate": 3.728640483909063e-06, "loss": 0.8261, "step": 5220 }, { "epoch": 2.1718014715440814, "grad_norm": 1.9092551469802856, "learning_rate": 3.725151356825768e-06, "loss": 0.8872, "step": 5221 }, { "epoch": 2.172217455736682, "grad_norm": 1.8997044563293457, "learning_rate": 3.7216634893033986e-06, "loss": 0.7933, "step": 5222 }, { "epoch": 2.1726334399292826, "grad_norm": 1.793986201286316, "learning_rate": 3.7181768820420717e-06, "loss": 0.8212, "step": 5223 }, { "epoch": 2.173049424121883, "grad_norm": 1.6623166799545288, "learning_rate": 3.714691535741668e-06, "loss": 0.6595, "step": 5224 }, { "epoch": 2.173465408314484, "grad_norm": 1.8614861965179443, "learning_rate": 3.711207451101799e-06, "loss": 0.8046, "step": 5225 }, { "epoch": 2.173881392507085, "grad_norm": 1.9359936714172363, "learning_rate": 3.7077246288218384e-06, "loss": 0.8837, "step": 5226 }, { "epoch": 2.1742973766996854, "grad_norm": 1.8844236135482788, "learning_rate": 3.7042430696008923e-06, "loss": 0.8513, "step": 5227 }, { "epoch": 2.174713360892286, "grad_norm": 1.60140061378479, "learning_rate": 3.700762774137826e-06, "loss": 0.7874, "step": 5228 }, { "epoch": 2.175129345084887, "grad_norm": 2.008193254470825, "learning_rate": 3.6972837431312424e-06, "loss": 0.8144, "step": 5229 }, { "epoch": 2.1755453292774876, "grad_norm": 2.023996353149414, "learning_rate": 3.693805977279493e-06, "loss": 0.7709, "step": 5230 }, { "epoch": 2.175961313470088, "grad_norm": 1.8643540143966675, "learning_rate": 3.6903294772806717e-06, "loss": 0.893, "step": 5231 }, { "epoch": 2.1763772976626887, "grad_norm": 1.7590192556381226, "learning_rate": 3.686854243832627e-06, "loss": 0.8761, "step": 5232 }, { "epoch": 2.1767932818552893, "grad_norm": 3.801751136779785, "learning_rate": 3.6833802776329497e-06, "loss": 0.736, "step": 5233 }, { "epoch": 2.1772092660478903, "grad_norm": 1.7919310331344604, "learning_rate": 3.6799075793789695e-06, "loss": 0.8589, "step": 5234 }, { "epoch": 2.177625250240491, "grad_norm": 1.7929977178573608, "learning_rate": 3.6764361497677738e-06, "loss": 0.9434, "step": 5235 }, { "epoch": 2.1780412344330915, "grad_norm": 1.9977853298187256, "learning_rate": 3.6729659894961813e-06, "loss": 0.8622, "step": 5236 }, { "epoch": 2.178457218625692, "grad_norm": 1.9595190286636353, "learning_rate": 3.669497099260769e-06, "loss": 0.8344, "step": 5237 }, { "epoch": 2.178873202818293, "grad_norm": 29.644582748413086, "learning_rate": 3.666029479757851e-06, "loss": 0.7569, "step": 5238 }, { "epoch": 2.1792891870108937, "grad_norm": 2.0921053886413574, "learning_rate": 3.6625631316834887e-06, "loss": 0.9859, "step": 5239 }, { "epoch": 2.1797051712034943, "grad_norm": 1.854573130607605, "learning_rate": 3.659098055733483e-06, "loss": 0.8219, "step": 5240 }, { "epoch": 2.180121155396095, "grad_norm": 1.9048911333084106, "learning_rate": 3.6556342526033928e-06, "loss": 0.8172, "step": 5241 }, { "epoch": 2.1805371395886954, "grad_norm": 3.353949785232544, "learning_rate": 3.6521717229885057e-06, "loss": 0.7503, "step": 5242 }, { "epoch": 2.1809531237812965, "grad_norm": 1.9180938005447388, "learning_rate": 3.648710467583867e-06, "loss": 0.8291, "step": 5243 }, { "epoch": 2.181369107973897, "grad_norm": 1.9443389177322388, "learning_rate": 3.645250487084254e-06, "loss": 0.9857, "step": 5244 }, { "epoch": 2.1817850921664976, "grad_norm": 2.027536153793335, "learning_rate": 3.641791782184203e-06, "loss": 0.7092, "step": 5245 }, { "epoch": 2.182201076359098, "grad_norm": 1.7405189275741577, "learning_rate": 3.638334353577976e-06, "loss": 0.8129, "step": 5246 }, { "epoch": 2.1826170605516992, "grad_norm": 97.62860870361328, "learning_rate": 3.634878201959594e-06, "loss": 0.7515, "step": 5247 }, { "epoch": 2.1830330447443, "grad_norm": 1.7685939073562622, "learning_rate": 3.6314233280228182e-06, "loss": 0.7641, "step": 5248 }, { "epoch": 2.1834490289369004, "grad_norm": 1.9470746517181396, "learning_rate": 3.6279697324611494e-06, "loss": 0.8262, "step": 5249 }, { "epoch": 2.183865013129501, "grad_norm": 2.0681777000427246, "learning_rate": 3.624517415967832e-06, "loss": 0.8105, "step": 5250 }, { "epoch": 2.1842809973221016, "grad_norm": 2.2760274410247803, "learning_rate": 3.6210663792358526e-06, "loss": 0.8475, "step": 5251 }, { "epoch": 2.1846969815147026, "grad_norm": 1.7667756080627441, "learning_rate": 3.61761662295795e-06, "loss": 0.766, "step": 5252 }, { "epoch": 2.185112965707303, "grad_norm": 1.9273465871810913, "learning_rate": 3.6141681478265943e-06, "loss": 0.771, "step": 5253 }, { "epoch": 2.1855289498999038, "grad_norm": 1.8662246465682983, "learning_rate": 3.610720954534008e-06, "loss": 0.813, "step": 5254 }, { "epoch": 2.1859449340925043, "grad_norm": 1.9185336828231812, "learning_rate": 3.607275043772147e-06, "loss": 0.8049, "step": 5255 }, { "epoch": 2.1863609182851054, "grad_norm": 1.9351086616516113, "learning_rate": 3.603830416232722e-06, "loss": 0.7974, "step": 5256 }, { "epoch": 2.186776902477706, "grad_norm": 1.9092408418655396, "learning_rate": 3.6003870726071697e-06, "loss": 0.9356, "step": 5257 }, { "epoch": 2.1871928866703065, "grad_norm": 1.864372968673706, "learning_rate": 3.596945013586688e-06, "loss": 0.8679, "step": 5258 }, { "epoch": 2.187608870862907, "grad_norm": 1.7325552701950073, "learning_rate": 3.5935042398622023e-06, "loss": 0.7384, "step": 5259 }, { "epoch": 2.1880248550555077, "grad_norm": 1.6823514699935913, "learning_rate": 3.5900647521243836e-06, "loss": 0.7397, "step": 5260 }, { "epoch": 2.1884408392481087, "grad_norm": 1.8241446018218994, "learning_rate": 3.5866265510636456e-06, "loss": 0.8523, "step": 5261 }, { "epoch": 2.1888568234407093, "grad_norm": 1.7738444805145264, "learning_rate": 3.5831896373701448e-06, "loss": 0.6756, "step": 5262 }, { "epoch": 2.18927280763331, "grad_norm": 1.7680195569992065, "learning_rate": 3.5797540117337835e-06, "loss": 0.7837, "step": 5263 }, { "epoch": 2.1896887918259105, "grad_norm": 1.794722080230713, "learning_rate": 3.576319674844194e-06, "loss": 0.8496, "step": 5264 }, { "epoch": 2.1901047760185115, "grad_norm": 1.8150790929794312, "learning_rate": 3.5728866273907626e-06, "loss": 0.7618, "step": 5265 }, { "epoch": 2.190520760211112, "grad_norm": 1.687150239944458, "learning_rate": 3.5694548700626053e-06, "loss": 0.7385, "step": 5266 }, { "epoch": 2.1909367444037127, "grad_norm": 1.9648897647857666, "learning_rate": 3.5660244035485893e-06, "loss": 0.9042, "step": 5267 }, { "epoch": 2.1913527285963132, "grad_norm": 1.7448817491531372, "learning_rate": 3.5625952285373145e-06, "loss": 0.7683, "step": 5268 }, { "epoch": 2.191768712788914, "grad_norm": 1.8109543323516846, "learning_rate": 3.559167345717127e-06, "loss": 0.796, "step": 5269 }, { "epoch": 2.192184696981515, "grad_norm": 1.8710273504257202, "learning_rate": 3.555740755776107e-06, "loss": 0.8169, "step": 5270 }, { "epoch": 2.1926006811741154, "grad_norm": 1.820030927658081, "learning_rate": 3.552315459402086e-06, "loss": 0.7955, "step": 5271 }, { "epoch": 2.193016665366716, "grad_norm": 1.726975917816162, "learning_rate": 3.5488914572826225e-06, "loss": 0.7586, "step": 5272 }, { "epoch": 2.1934326495593166, "grad_norm": 1.7657256126403809, "learning_rate": 3.545468750105031e-06, "loss": 0.8848, "step": 5273 }, { "epoch": 2.1938486337519176, "grad_norm": 49.461246490478516, "learning_rate": 3.542047338556348e-06, "loss": 0.6923, "step": 5274 }, { "epoch": 2.194264617944518, "grad_norm": 2.1211869716644287, "learning_rate": 3.538627223323369e-06, "loss": 0.8041, "step": 5275 }, { "epoch": 2.194680602137119, "grad_norm": 1.7005524635314941, "learning_rate": 3.535208405092608e-06, "loss": 0.7694, "step": 5276 }, { "epoch": 2.1950965863297194, "grad_norm": 1.9312902688980103, "learning_rate": 3.531790884550341e-06, "loss": 0.8177, "step": 5277 }, { "epoch": 2.19551257052232, "grad_norm": 1.7196029424667358, "learning_rate": 3.528374662382569e-06, "loss": 0.8294, "step": 5278 }, { "epoch": 2.195928554714921, "grad_norm": 1.7035728693008423, "learning_rate": 3.52495973927503e-06, "loss": 0.8033, "step": 5279 }, { "epoch": 2.1963445389075216, "grad_norm": 1.8280093669891357, "learning_rate": 3.521546115913217e-06, "loss": 0.8162, "step": 5280 }, { "epoch": 2.196760523100122, "grad_norm": 1.928501844406128, "learning_rate": 3.5181337929823435e-06, "loss": 0.7902, "step": 5281 }, { "epoch": 2.1971765072927227, "grad_norm": 1.7626194953918457, "learning_rate": 3.514722771167378e-06, "loss": 0.7806, "step": 5282 }, { "epoch": 2.1975924914853238, "grad_norm": 1.7221252918243408, "learning_rate": 3.5113130511530146e-06, "loss": 0.7834, "step": 5283 }, { "epoch": 2.1980084756779243, "grad_norm": 1.801406979560852, "learning_rate": 3.507904633623699e-06, "loss": 0.791, "step": 5284 }, { "epoch": 2.198424459870525, "grad_norm": 172.13836669921875, "learning_rate": 3.5044975192636e-06, "loss": 0.6723, "step": 5285 }, { "epoch": 2.1988404440631255, "grad_norm": 1.8665391206741333, "learning_rate": 3.501091708756642e-06, "loss": 0.8213, "step": 5286 }, { "epoch": 2.199256428255726, "grad_norm": 2.017426013946533, "learning_rate": 3.4976872027864726e-06, "loss": 0.8468, "step": 5287 }, { "epoch": 2.199672412448327, "grad_norm": 1.9625240564346313, "learning_rate": 3.494284002036489e-06, "loss": 0.8716, "step": 5288 }, { "epoch": 2.2000883966409277, "grad_norm": 1.9443635940551758, "learning_rate": 3.490882107189819e-06, "loss": 0.7945, "step": 5289 }, { "epoch": 2.2005043808335283, "grad_norm": 1.727624535560608, "learning_rate": 3.487481518929331e-06, "loss": 0.7318, "step": 5290 }, { "epoch": 2.200920365026129, "grad_norm": 1.8450473546981812, "learning_rate": 3.484082237937627e-06, "loss": 0.8405, "step": 5291 }, { "epoch": 2.20133634921873, "grad_norm": 1.7749232053756714, "learning_rate": 3.4806842648970575e-06, "loss": 0.8205, "step": 5292 }, { "epoch": 2.2017523334113305, "grad_norm": 1.9306164979934692, "learning_rate": 3.477287600489696e-06, "loss": 0.7744, "step": 5293 }, { "epoch": 2.202168317603931, "grad_norm": 2.239596128463745, "learning_rate": 3.4738922453973644e-06, "loss": 0.8121, "step": 5294 }, { "epoch": 2.2025843017965316, "grad_norm": 1.8506404161453247, "learning_rate": 3.4704982003016207e-06, "loss": 0.5906, "step": 5295 }, { "epoch": 2.2030002859891322, "grad_norm": 1.9245649576187134, "learning_rate": 3.4671054658837523e-06, "loss": 0.8043, "step": 5296 }, { "epoch": 2.2034162701817332, "grad_norm": 1.7883727550506592, "learning_rate": 3.4637140428247927e-06, "loss": 0.7746, "step": 5297 }, { "epoch": 2.203832254374334, "grad_norm": 1.731395959854126, "learning_rate": 3.460323931805507e-06, "loss": 0.7254, "step": 5298 }, { "epoch": 2.2042482385669344, "grad_norm": 1.8635908365249634, "learning_rate": 3.456935133506397e-06, "loss": 0.7831, "step": 5299 }, { "epoch": 2.204664222759535, "grad_norm": 1.8748648166656494, "learning_rate": 3.453547648607698e-06, "loss": 0.8762, "step": 5300 }, { "epoch": 2.205080206952136, "grad_norm": 29.801210403442383, "learning_rate": 3.450161477789393e-06, "loss": 0.8392, "step": 5301 }, { "epoch": 2.2054961911447366, "grad_norm": 1.864582896232605, "learning_rate": 3.446776621731186e-06, "loss": 0.8738, "step": 5302 }, { "epoch": 2.205912175337337, "grad_norm": 1.7389981746673584, "learning_rate": 3.4433930811125327e-06, "loss": 0.7697, "step": 5303 }, { "epoch": 2.2063281595299378, "grad_norm": 1.9396461248397827, "learning_rate": 3.44001085661261e-06, "loss": 0.8193, "step": 5304 }, { "epoch": 2.2067441437225384, "grad_norm": 104.966552734375, "learning_rate": 3.436629948910344e-06, "loss": 0.8065, "step": 5305 }, { "epoch": 2.2071601279151394, "grad_norm": 1.939540982246399, "learning_rate": 3.4332503586843835e-06, "loss": 0.8056, "step": 5306 }, { "epoch": 2.20757611210774, "grad_norm": 1.768723726272583, "learning_rate": 3.4298720866131254e-06, "loss": 0.8211, "step": 5307 }, { "epoch": 2.2079920963003405, "grad_norm": 2.251084804534912, "learning_rate": 3.4264951333746932e-06, "loss": 0.7094, "step": 5308 }, { "epoch": 2.208408080492941, "grad_norm": 1.9498095512390137, "learning_rate": 3.4231194996469443e-06, "loss": 0.8431, "step": 5309 }, { "epoch": 2.208824064685542, "grad_norm": 1.9106277227401733, "learning_rate": 3.419745186107483e-06, "loss": 0.9728, "step": 5310 }, { "epoch": 2.2092400488781427, "grad_norm": 1.8768904209136963, "learning_rate": 3.4163721934336337e-06, "loss": 0.6254, "step": 5311 }, { "epoch": 2.2096560330707433, "grad_norm": 1.8634529113769531, "learning_rate": 3.413000522302469e-06, "loss": 0.7842, "step": 5312 }, { "epoch": 2.210072017263344, "grad_norm": 1.9201453924179077, "learning_rate": 3.4096301733907854e-06, "loss": 0.8237, "step": 5313 }, { "epoch": 2.2104880014559445, "grad_norm": 1.8409569263458252, "learning_rate": 3.4062611473751226e-06, "loss": 0.7841, "step": 5314 }, { "epoch": 2.2109039856485455, "grad_norm": 118.04414367675781, "learning_rate": 3.4028934449317473e-06, "loss": 0.5985, "step": 5315 }, { "epoch": 2.211319969841146, "grad_norm": 1.9516149759292603, "learning_rate": 3.3995270667366675e-06, "loss": 0.8172, "step": 5316 }, { "epoch": 2.2117359540337467, "grad_norm": 1.8995810747146606, "learning_rate": 3.3961620134656216e-06, "loss": 0.7595, "step": 5317 }, { "epoch": 2.2121519382263473, "grad_norm": 4.554584980010986, "learning_rate": 3.392798285794081e-06, "loss": 0.8094, "step": 5318 }, { "epoch": 2.2125679224189483, "grad_norm": 1.91974675655365, "learning_rate": 3.3894358843972485e-06, "loss": 0.7955, "step": 5319 }, { "epoch": 2.212983906611549, "grad_norm": 2.0827555656433105, "learning_rate": 3.3860748099500716e-06, "loss": 0.6323, "step": 5320 }, { "epoch": 2.2133998908041495, "grad_norm": 1.8832755088806152, "learning_rate": 3.3827150631272188e-06, "loss": 0.8046, "step": 5321 }, { "epoch": 2.21381587499675, "grad_norm": 1.9807627201080322, "learning_rate": 3.3793566446031036e-06, "loss": 0.7996, "step": 5322 }, { "epoch": 2.2142318591893506, "grad_norm": 2.0612804889678955, "learning_rate": 3.3759995550518597e-06, "loss": 0.7985, "step": 5323 }, { "epoch": 2.2146478433819516, "grad_norm": 1.8286055326461792, "learning_rate": 3.3726437951473666e-06, "loss": 0.8473, "step": 5324 }, { "epoch": 2.2150638275745522, "grad_norm": 1.768884301185608, "learning_rate": 3.3692893655632332e-06, "loss": 0.7421, "step": 5325 }, { "epoch": 2.215479811767153, "grad_norm": 22.10213851928711, "learning_rate": 3.3659362669727922e-06, "loss": 0.7895, "step": 5326 }, { "epoch": 2.2158957959597534, "grad_norm": 1.921110987663269, "learning_rate": 3.3625845000491265e-06, "loss": 0.7384, "step": 5327 }, { "epoch": 2.2163117801523544, "grad_norm": 1.8063420057296753, "learning_rate": 3.359234065465037e-06, "loss": 0.6888, "step": 5328 }, { "epoch": 2.216727764344955, "grad_norm": 2.212836265563965, "learning_rate": 3.3558849638930613e-06, "loss": 0.8034, "step": 5329 }, { "epoch": 2.2171437485375556, "grad_norm": 21.149229049682617, "learning_rate": 3.3525371960054675e-06, "loss": 0.7114, "step": 5330 }, { "epoch": 2.217559732730156, "grad_norm": 1.7000916004180908, "learning_rate": 3.349190762474265e-06, "loss": 0.7513, "step": 5331 }, { "epoch": 2.2179757169227567, "grad_norm": 1.8191053867340088, "learning_rate": 3.3458456639711834e-06, "loss": 0.8541, "step": 5332 }, { "epoch": 2.2183917011153578, "grad_norm": 2.012403726577759, "learning_rate": 3.342501901167696e-06, "loss": 0.7572, "step": 5333 }, { "epoch": 2.2188076853079584, "grad_norm": 1.9099805355072021, "learning_rate": 3.339159474734994e-06, "loss": 0.7709, "step": 5334 }, { "epoch": 2.219223669500559, "grad_norm": 1.78072190284729, "learning_rate": 3.3358183853440173e-06, "loss": 0.7838, "step": 5335 }, { "epoch": 2.2196396536931595, "grad_norm": 5.3864264488220215, "learning_rate": 3.332478633665421e-06, "loss": 0.7687, "step": 5336 }, { "epoch": 2.2200556378857605, "grad_norm": 1.807645559310913, "learning_rate": 3.3291402203696043e-06, "loss": 0.8576, "step": 5337 }, { "epoch": 2.220471622078361, "grad_norm": 1.7884447574615479, "learning_rate": 3.3258031461266905e-06, "loss": 0.7406, "step": 5338 }, { "epoch": 2.2208876062709617, "grad_norm": 1.9699897766113281, "learning_rate": 3.3224674116065323e-06, "loss": 0.7724, "step": 5339 }, { "epoch": 2.2213035904635623, "grad_norm": 1.8687282800674438, "learning_rate": 3.319133017478725e-06, "loss": 0.8571, "step": 5340 }, { "epoch": 2.221719574656163, "grad_norm": 1.826531171798706, "learning_rate": 3.3157999644125795e-06, "loss": 0.864, "step": 5341 }, { "epoch": 2.222135558848764, "grad_norm": 1.843608021736145, "learning_rate": 3.312468253077151e-06, "loss": 0.7902, "step": 5342 }, { "epoch": 2.2225515430413645, "grad_norm": 1.7392972707748413, "learning_rate": 3.309137884141215e-06, "loss": 0.6564, "step": 5343 }, { "epoch": 2.222967527233965, "grad_norm": 7.1272759437561035, "learning_rate": 3.305808858273286e-06, "loss": 0.8584, "step": 5344 }, { "epoch": 2.2233835114265657, "grad_norm": 1.8199493885040283, "learning_rate": 3.302481176141601e-06, "loss": 0.7115, "step": 5345 }, { "epoch": 2.2237994956191667, "grad_norm": 2.2787978649139404, "learning_rate": 3.299154838414137e-06, "loss": 0.8093, "step": 5346 }, { "epoch": 2.2242154798117673, "grad_norm": 1.7833164930343628, "learning_rate": 3.29582984575859e-06, "loss": 0.7769, "step": 5347 }, { "epoch": 2.224631464004368, "grad_norm": 1.7083004713058472, "learning_rate": 3.2925061988423947e-06, "loss": 0.8063, "step": 5348 }, { "epoch": 2.2250474481969684, "grad_norm": 1.917017936706543, "learning_rate": 3.2891838983327074e-06, "loss": 0.7649, "step": 5349 }, { "epoch": 2.225463432389569, "grad_norm": 2.081723928451538, "learning_rate": 3.285862944896425e-06, "loss": 0.7931, "step": 5350 }, { "epoch": 2.22587941658217, "grad_norm": 2.011575222015381, "learning_rate": 3.2825433392001625e-06, "loss": 0.7943, "step": 5351 }, { "epoch": 2.2262954007747706, "grad_norm": 1.9975409507751465, "learning_rate": 3.2792250819102768e-06, "loss": 0.8718, "step": 5352 }, { "epoch": 2.226711384967371, "grad_norm": 1.807413101196289, "learning_rate": 3.275908173692839e-06, "loss": 0.7448, "step": 5353 }, { "epoch": 2.227127369159972, "grad_norm": 1.9843825101852417, "learning_rate": 3.272592615213662e-06, "loss": 0.9548, "step": 5354 }, { "epoch": 2.227543353352573, "grad_norm": 1.877913236618042, "learning_rate": 3.2692784071382878e-06, "loss": 0.8532, "step": 5355 }, { "epoch": 2.2279593375451734, "grad_norm": 28.90621566772461, "learning_rate": 3.2659655501319787e-06, "loss": 0.7048, "step": 5356 }, { "epoch": 2.228375321737774, "grad_norm": 2.0623323917388916, "learning_rate": 3.26265404485973e-06, "loss": 0.8287, "step": 5357 }, { "epoch": 2.2287913059303746, "grad_norm": 1.9406317472457886, "learning_rate": 3.259343891986262e-06, "loss": 0.7747, "step": 5358 }, { "epoch": 2.229207290122975, "grad_norm": 1.921051025390625, "learning_rate": 3.2560350921760343e-06, "loss": 0.9031, "step": 5359 }, { "epoch": 2.229623274315576, "grad_norm": 2.005985975265503, "learning_rate": 3.25272764609322e-06, "loss": 0.8668, "step": 5360 }, { "epoch": 2.2300392585081767, "grad_norm": 7.723353385925293, "learning_rate": 3.2494215544017382e-06, "loss": 0.735, "step": 5361 }, { "epoch": 2.2304552427007773, "grad_norm": 1.7588517665863037, "learning_rate": 3.246116817765217e-06, "loss": 0.7261, "step": 5362 }, { "epoch": 2.230871226893378, "grad_norm": 1.832783579826355, "learning_rate": 3.2428134368470276e-06, "loss": 0.7483, "step": 5363 }, { "epoch": 2.231287211085979, "grad_norm": 1.86799156665802, "learning_rate": 3.23951141231026e-06, "loss": 0.7547, "step": 5364 }, { "epoch": 2.2317031952785795, "grad_norm": 1.9852381944656372, "learning_rate": 3.2362107448177384e-06, "loss": 0.8844, "step": 5365 }, { "epoch": 2.23211917947118, "grad_norm": 1.8015494346618652, "learning_rate": 3.2329114350320056e-06, "loss": 0.8002, "step": 5366 }, { "epoch": 2.2325351636637807, "grad_norm": 2.0105440616607666, "learning_rate": 3.229613483615346e-06, "loss": 0.8978, "step": 5367 }, { "epoch": 2.2329511478563813, "grad_norm": 1.82152259349823, "learning_rate": 3.226316891229757e-06, "loss": 0.7574, "step": 5368 }, { "epoch": 2.2333671320489823, "grad_norm": 1.9259718656539917, "learning_rate": 3.223021658536968e-06, "loss": 0.8324, "step": 5369 }, { "epoch": 2.233783116241583, "grad_norm": 1.8555612564086914, "learning_rate": 3.219727786198442e-06, "loss": 0.7968, "step": 5370 }, { "epoch": 2.2341991004341835, "grad_norm": 1.7408933639526367, "learning_rate": 3.216435274875357e-06, "loss": 0.7426, "step": 5371 }, { "epoch": 2.234615084626784, "grad_norm": 1.6974366903305054, "learning_rate": 3.213144125228631e-06, "loss": 0.7763, "step": 5372 }, { "epoch": 2.235031068819385, "grad_norm": 1.7979482412338257, "learning_rate": 3.2098543379188963e-06, "loss": 0.7263, "step": 5373 }, { "epoch": 2.2354470530119857, "grad_norm": 1.7422016859054565, "learning_rate": 3.206565913606523e-06, "loss": 0.7918, "step": 5374 }, { "epoch": 2.2358630372045862, "grad_norm": 1.8391250371932983, "learning_rate": 3.2032788529515956e-06, "loss": 0.8335, "step": 5375 }, { "epoch": 2.236279021397187, "grad_norm": 1.8145339488983154, "learning_rate": 3.199993156613939e-06, "loss": 0.7346, "step": 5376 }, { "epoch": 2.2366950055897874, "grad_norm": 459.46124267578125, "learning_rate": 3.196708825253091e-06, "loss": 0.666, "step": 5377 }, { "epoch": 2.2371109897823884, "grad_norm": 2.0344152450561523, "learning_rate": 3.1934258595283216e-06, "loss": 0.8433, "step": 5378 }, { "epoch": 2.237526973974989, "grad_norm": 1.9731383323669434, "learning_rate": 3.1901442600986253e-06, "loss": 0.8523, "step": 5379 }, { "epoch": 2.2379429581675896, "grad_norm": 1.729577660560608, "learning_rate": 3.1868640276227268e-06, "loss": 0.8219, "step": 5380 }, { "epoch": 2.23835894236019, "grad_norm": 2.1799087524414062, "learning_rate": 3.1835851627590674e-06, "loss": 0.855, "step": 5381 }, { "epoch": 2.238774926552791, "grad_norm": 1.8316904306411743, "learning_rate": 3.180307666165825e-06, "loss": 0.7824, "step": 5382 }, { "epoch": 2.239190910745392, "grad_norm": 1.7768597602844238, "learning_rate": 3.177031538500891e-06, "loss": 0.6811, "step": 5383 }, { "epoch": 2.2396068949379924, "grad_norm": 1.8736350536346436, "learning_rate": 3.173756780421892e-06, "loss": 0.8256, "step": 5384 }, { "epoch": 2.240022879130593, "grad_norm": 6.671579837799072, "learning_rate": 3.1704833925861767e-06, "loss": 0.8322, "step": 5385 }, { "epoch": 2.2404388633231935, "grad_norm": 1.8610728979110718, "learning_rate": 3.167211375650816e-06, "loss": 0.8378, "step": 5386 }, { "epoch": 2.2408548475157946, "grad_norm": 2.113386631011963, "learning_rate": 3.163940730272608e-06, "loss": 0.798, "step": 5387 }, { "epoch": 2.241270831708395, "grad_norm": 1.703486680984497, "learning_rate": 3.1606714571080697e-06, "loss": 0.6937, "step": 5388 }, { "epoch": 2.2416868159009957, "grad_norm": 1.8761167526245117, "learning_rate": 3.1574035568134564e-06, "loss": 0.7583, "step": 5389 }, { "epoch": 2.2421028000935963, "grad_norm": 10.639949798583984, "learning_rate": 3.1541370300447317e-06, "loss": 0.8585, "step": 5390 }, { "epoch": 2.2425187842861973, "grad_norm": 1.9035831689834595, "learning_rate": 3.1508718774575974e-06, "loss": 0.8243, "step": 5391 }, { "epoch": 2.242934768478798, "grad_norm": 1.9437410831451416, "learning_rate": 3.1476080997074677e-06, "loss": 0.7534, "step": 5392 }, { "epoch": 2.2433507526713985, "grad_norm": 2.1249618530273438, "learning_rate": 3.14434569744949e-06, "loss": 0.8281, "step": 5393 }, { "epoch": 2.243766736863999, "grad_norm": 6.747101306915283, "learning_rate": 3.141084671338528e-06, "loss": 0.7523, "step": 5394 }, { "epoch": 2.2441827210565997, "grad_norm": 1.7793229818344116, "learning_rate": 3.1378250220291784e-06, "loss": 0.8512, "step": 5395 }, { "epoch": 2.2445987052492007, "grad_norm": 56.89495086669922, "learning_rate": 3.1345667501757525e-06, "loss": 0.8691, "step": 5396 }, { "epoch": 2.2450146894418013, "grad_norm": 1.844678282737732, "learning_rate": 3.1313098564322898e-06, "loss": 0.8371, "step": 5397 }, { "epoch": 2.245430673634402, "grad_norm": 1.8676340579986572, "learning_rate": 3.1280543414525477e-06, "loss": 0.7307, "step": 5398 }, { "epoch": 2.2458466578270024, "grad_norm": 2.021735668182373, "learning_rate": 3.1248002058900185e-06, "loss": 0.8348, "step": 5399 }, { "epoch": 2.2462626420196035, "grad_norm": 1.8459666967391968, "learning_rate": 3.1215474503979027e-06, "loss": 0.8827, "step": 5400 }, { "epoch": 2.246678626212204, "grad_norm": 43.65043258666992, "learning_rate": 3.118296075629136e-06, "loss": 0.6771, "step": 5401 }, { "epoch": 2.2470946104048046, "grad_norm": 2.3608241081237793, "learning_rate": 3.115046082236375e-06, "loss": 0.7675, "step": 5402 }, { "epoch": 2.247510594597405, "grad_norm": 2.000380754470825, "learning_rate": 3.111797470871989e-06, "loss": 0.8467, "step": 5403 }, { "epoch": 2.247926578790006, "grad_norm": 1.7587710618972778, "learning_rate": 3.1085502421880853e-06, "loss": 0.6858, "step": 5404 }, { "epoch": 2.248342562982607, "grad_norm": 1.8220257759094238, "learning_rate": 3.1053043968364772e-06, "loss": 0.8038, "step": 5405 }, { "epoch": 2.2487585471752074, "grad_norm": 2.071559190750122, "learning_rate": 3.1020599354687175e-06, "loss": 0.9134, "step": 5406 }, { "epoch": 2.249174531367808, "grad_norm": 1.921759843826294, "learning_rate": 3.098816858736067e-06, "loss": 0.7593, "step": 5407 }, { "epoch": 2.2495905155604086, "grad_norm": 1.936944603919983, "learning_rate": 3.0955751672895153e-06, "loss": 0.7564, "step": 5408 }, { "epoch": 2.2500064997530096, "grad_norm": 1.8884817361831665, "learning_rate": 3.092334861779769e-06, "loss": 0.7631, "step": 5409 }, { "epoch": 2.25042248394561, "grad_norm": 2.0744404792785645, "learning_rate": 3.0890959428572665e-06, "loss": 0.8302, "step": 5410 }, { "epoch": 2.2508384681382108, "grad_norm": 13.466004371643066, "learning_rate": 3.0858584111721535e-06, "loss": 0.8108, "step": 5411 }, { "epoch": 2.2512544523308113, "grad_norm": 1.815356969833374, "learning_rate": 3.0826222673743144e-06, "loss": 0.7005, "step": 5412 }, { "epoch": 2.251670436523412, "grad_norm": 2.034930467605591, "learning_rate": 3.079387512113338e-06, "loss": 0.7931, "step": 5413 }, { "epoch": 2.252086420716013, "grad_norm": 1.8689861297607422, "learning_rate": 3.076154146038548e-06, "loss": 0.8291, "step": 5414 }, { "epoch": 2.2525024049086135, "grad_norm": 1.9936765432357788, "learning_rate": 3.0729221697989775e-06, "loss": 0.7675, "step": 5415 }, { "epoch": 2.252918389101214, "grad_norm": 1.7166961431503296, "learning_rate": 3.069691584043394e-06, "loss": 0.6973, "step": 5416 }, { "epoch": 2.2533343732938147, "grad_norm": 2.0030007362365723, "learning_rate": 3.066462389420274e-06, "loss": 0.7626, "step": 5417 }, { "epoch": 2.2537503574864157, "grad_norm": 1.854103446006775, "learning_rate": 3.0632345865778146e-06, "loss": 0.8602, "step": 5418 }, { "epoch": 2.2541663416790163, "grad_norm": 1.8020752668380737, "learning_rate": 3.060008176163948e-06, "loss": 0.9307, "step": 5419 }, { "epoch": 2.254582325871617, "grad_norm": 28.168794631958008, "learning_rate": 3.0567831588263075e-06, "loss": 0.7843, "step": 5420 }, { "epoch": 2.2549983100642175, "grad_norm": 1.791948914527893, "learning_rate": 3.0535595352122636e-06, "loss": 0.7432, "step": 5421 }, { "epoch": 2.255414294256818, "grad_norm": 1.8912432193756104, "learning_rate": 3.0503373059688947e-06, "loss": 0.9033, "step": 5422 }, { "epoch": 2.255830278449419, "grad_norm": 1.843916654586792, "learning_rate": 3.047116471743009e-06, "loss": 0.7175, "step": 5423 }, { "epoch": 2.2562462626420197, "grad_norm": 1.75777268409729, "learning_rate": 3.0438970331811235e-06, "loss": 0.6735, "step": 5424 }, { "epoch": 2.2566622468346202, "grad_norm": 9.021940231323242, "learning_rate": 3.0406789909294897e-06, "loss": 0.809, "step": 5425 }, { "epoch": 2.257078231027221, "grad_norm": 1.7712006568908691, "learning_rate": 3.0374623456340645e-06, "loss": 0.7858, "step": 5426 }, { "epoch": 2.257494215219822, "grad_norm": 4.752316951751709, "learning_rate": 3.0342470979405337e-06, "loss": 0.8293, "step": 5427 }, { "epoch": 2.2579101994124224, "grad_norm": 1.7860535383224487, "learning_rate": 3.031033248494294e-06, "loss": 0.8295, "step": 5428 }, { "epoch": 2.258326183605023, "grad_norm": 1.9252986907958984, "learning_rate": 3.027820797940474e-06, "loss": 0.8055, "step": 5429 }, { "epoch": 2.2587421677976236, "grad_norm": 1.8181335926055908, "learning_rate": 3.024609746923909e-06, "loss": 0.7692, "step": 5430 }, { "epoch": 2.259158151990224, "grad_norm": 1.8601833581924438, "learning_rate": 3.0214000960891586e-06, "loss": 0.8154, "step": 5431 }, { "epoch": 2.259574136182825, "grad_norm": 1.8533669710159302, "learning_rate": 3.0181918460805084e-06, "loss": 0.8262, "step": 5432 }, { "epoch": 2.259990120375426, "grad_norm": 1.7258317470550537, "learning_rate": 3.0149849975419465e-06, "loss": 0.6682, "step": 5433 }, { "epoch": 2.2604061045680264, "grad_norm": 1.8752912282943726, "learning_rate": 3.0117795511171978e-06, "loss": 0.6791, "step": 5434 }, { "epoch": 2.260822088760627, "grad_norm": 1.9990400075912476, "learning_rate": 3.0085755074496916e-06, "loss": 0.7929, "step": 5435 }, { "epoch": 2.261238072953228, "grad_norm": 4.1844658851623535, "learning_rate": 3.005372867182582e-06, "loss": 0.8339, "step": 5436 }, { "epoch": 2.2616540571458286, "grad_norm": 1.883150577545166, "learning_rate": 3.002171630958738e-06, "loss": 0.8534, "step": 5437 }, { "epoch": 2.262070041338429, "grad_norm": 1.8325997591018677, "learning_rate": 2.9989717994207546e-06, "loss": 0.7908, "step": 5438 }, { "epoch": 2.2624860255310297, "grad_norm": 1.7042127847671509, "learning_rate": 2.9957733732109316e-06, "loss": 0.6275, "step": 5439 }, { "epoch": 2.2629020097236303, "grad_norm": 2.022932529449463, "learning_rate": 2.992576352971304e-06, "loss": 0.8367, "step": 5440 }, { "epoch": 2.2633179939162313, "grad_norm": 1.842630386352539, "learning_rate": 2.989380739343606e-06, "loss": 0.8254, "step": 5441 }, { "epoch": 2.263733978108832, "grad_norm": 1.8347159624099731, "learning_rate": 2.986186532969306e-06, "loss": 0.6657, "step": 5442 }, { "epoch": 2.2641499623014325, "grad_norm": 1.6677082777023315, "learning_rate": 2.9829937344895753e-06, "loss": 0.8005, "step": 5443 }, { "epoch": 2.264565946494033, "grad_norm": 1.7799949645996094, "learning_rate": 2.979802344545317e-06, "loss": 0.8327, "step": 5444 }, { "epoch": 2.264981930686634, "grad_norm": 2.161184549331665, "learning_rate": 2.9766123637771384e-06, "loss": 0.7884, "step": 5445 }, { "epoch": 2.2653979148792347, "grad_norm": 1.8703166246414185, "learning_rate": 2.973423792825374e-06, "loss": 0.7325, "step": 5446 }, { "epoch": 2.2658138990718353, "grad_norm": 1.7684204578399658, "learning_rate": 2.970236632330069e-06, "loss": 0.8096, "step": 5447 }, { "epoch": 2.266229883264436, "grad_norm": 1.9428753852844238, "learning_rate": 2.967050882930985e-06, "loss": 0.7913, "step": 5448 }, { "epoch": 2.2666458674570364, "grad_norm": 1.8232086896896362, "learning_rate": 2.9638665452676084e-06, "loss": 0.8797, "step": 5449 }, { "epoch": 2.2670618516496375, "grad_norm": 1.8376432657241821, "learning_rate": 2.96068361997913e-06, "loss": 0.859, "step": 5450 }, { "epoch": 2.267477835842238, "grad_norm": 1.7994533777236938, "learning_rate": 2.95750210770447e-06, "loss": 0.9072, "step": 5451 }, { "epoch": 2.2678938200348386, "grad_norm": 4.955056190490723, "learning_rate": 2.954322009082253e-06, "loss": 0.7241, "step": 5452 }, { "epoch": 2.268309804227439, "grad_norm": 57.6809196472168, "learning_rate": 2.9511433247508325e-06, "loss": 0.7711, "step": 5453 }, { "epoch": 2.2687257884200402, "grad_norm": 1936.0162353515625, "learning_rate": 2.9479660553482637e-06, "loss": 0.7186, "step": 5454 }, { "epoch": 2.269141772612641, "grad_norm": 1.9661498069763184, "learning_rate": 2.944790201512331e-06, "loss": 0.803, "step": 5455 }, { "epoch": 2.2695577568052414, "grad_norm": 1.8679122924804688, "learning_rate": 2.9416157638805276e-06, "loss": 0.7713, "step": 5456 }, { "epoch": 2.269973740997842, "grad_norm": 1.9900058507919312, "learning_rate": 2.938442743090062e-06, "loss": 0.783, "step": 5457 }, { "epoch": 2.2703897251904426, "grad_norm": 2.107917070388794, "learning_rate": 2.9352711397778576e-06, "loss": 0.7914, "step": 5458 }, { "epoch": 2.2708057093830436, "grad_norm": 1.903920292854309, "learning_rate": 2.9321009545805624e-06, "loss": 0.8726, "step": 5459 }, { "epoch": 2.271221693575644, "grad_norm": 1.8860116004943848, "learning_rate": 2.9289321881345257e-06, "loss": 0.7473, "step": 5460 }, { "epoch": 2.2716376777682448, "grad_norm": 1.852716088294983, "learning_rate": 2.9257648410758233e-06, "loss": 0.7431, "step": 5461 }, { "epoch": 2.2720536619608453, "grad_norm": 2.016904592514038, "learning_rate": 2.9225989140402445e-06, "loss": 0.8635, "step": 5462 }, { "epoch": 2.2724696461534464, "grad_norm": 1.970158576965332, "learning_rate": 2.9194344076632864e-06, "loss": 0.8616, "step": 5463 }, { "epoch": 2.272885630346047, "grad_norm": 1.9124550819396973, "learning_rate": 2.9162713225801696e-06, "loss": 0.8028, "step": 5464 }, { "epoch": 2.2733016145386475, "grad_norm": 1.872523307800293, "learning_rate": 2.9131096594258235e-06, "loss": 0.7519, "step": 5465 }, { "epoch": 2.273717598731248, "grad_norm": 1.8831247091293335, "learning_rate": 2.909949418834895e-06, "loss": 0.6907, "step": 5466 }, { "epoch": 2.2741335829238487, "grad_norm": 1.8988821506500244, "learning_rate": 2.9067906014417403e-06, "loss": 0.9224, "step": 5467 }, { "epoch": 2.2745495671164497, "grad_norm": 7.1796979904174805, "learning_rate": 2.9036332078804407e-06, "loss": 0.7966, "step": 5468 }, { "epoch": 2.2749655513090503, "grad_norm": 1.7788828611373901, "learning_rate": 2.9004772387847792e-06, "loss": 0.7695, "step": 5469 }, { "epoch": 2.275381535501651, "grad_norm": 128.2947235107422, "learning_rate": 2.897322694788264e-06, "loss": 0.7491, "step": 5470 }, { "epoch": 2.2757975196942515, "grad_norm": 2.0100514888763428, "learning_rate": 2.8941695765241075e-06, "loss": 0.6892, "step": 5471 }, { "epoch": 2.2762135038868525, "grad_norm": 1.7790110111236572, "learning_rate": 2.8910178846252446e-06, "loss": 0.8692, "step": 5472 }, { "epoch": 2.276629488079453, "grad_norm": 1.9530867338180542, "learning_rate": 2.887867619724314e-06, "loss": 0.8134, "step": 5473 }, { "epoch": 2.2770454722720537, "grad_norm": 49.65654373168945, "learning_rate": 2.8847187824536817e-06, "loss": 0.7774, "step": 5474 }, { "epoch": 2.2774614564646543, "grad_norm": 1.719744086265564, "learning_rate": 2.881571373445413e-06, "loss": 0.7772, "step": 5475 }, { "epoch": 2.277877440657255, "grad_norm": 1.7914246320724487, "learning_rate": 2.8784253933312923e-06, "loss": 0.7185, "step": 5476 }, { "epoch": 2.278293424849856, "grad_norm": 1.9502302408218384, "learning_rate": 2.8752808427428225e-06, "loss": 0.841, "step": 5477 }, { "epoch": 2.2787094090424564, "grad_norm": 1.9872236251831055, "learning_rate": 2.8721377223112078e-06, "loss": 0.8842, "step": 5478 }, { "epoch": 2.279125393235057, "grad_norm": 1.77939772605896, "learning_rate": 2.8689960326673794e-06, "loss": 0.7612, "step": 5479 }, { "epoch": 2.2795413774276576, "grad_norm": 1.9166769981384277, "learning_rate": 2.865855774441968e-06, "loss": 0.8317, "step": 5480 }, { "epoch": 2.2799573616202586, "grad_norm": 1.9627313613891602, "learning_rate": 2.8627169482653273e-06, "loss": 0.7295, "step": 5481 }, { "epoch": 2.280373345812859, "grad_norm": 2.1139469146728516, "learning_rate": 2.8595795547675133e-06, "loss": 0.8049, "step": 5482 }, { "epoch": 2.28078933000546, "grad_norm": 1.996202826499939, "learning_rate": 2.8564435945783078e-06, "loss": 0.8478, "step": 5483 }, { "epoch": 2.2812053141980604, "grad_norm": 2.00169038772583, "learning_rate": 2.8533090683271903e-06, "loss": 0.7769, "step": 5484 }, { "epoch": 2.281621298390661, "grad_norm": 40.795406341552734, "learning_rate": 2.8501759766433668e-06, "loss": 0.7556, "step": 5485 }, { "epoch": 2.282037282583262, "grad_norm": 1.9795732498168945, "learning_rate": 2.847044320155743e-06, "loss": 0.8945, "step": 5486 }, { "epoch": 2.2824532667758626, "grad_norm": 1.9693291187286377, "learning_rate": 2.8439140994929427e-06, "loss": 0.8647, "step": 5487 }, { "epoch": 2.282869250968463, "grad_norm": 1.9678884744644165, "learning_rate": 2.8407853152832975e-06, "loss": 0.8948, "step": 5488 }, { "epoch": 2.2832852351610637, "grad_norm": 1.9158090353012085, "learning_rate": 2.83765796815486e-06, "loss": 0.8349, "step": 5489 }, { "epoch": 2.2837012193536648, "grad_norm": 1.7852706909179688, "learning_rate": 2.8345320587353798e-06, "loss": 0.843, "step": 5490 }, { "epoch": 2.2841172035462654, "grad_norm": 13.863811492919922, "learning_rate": 2.8314075876523308e-06, "loss": 0.8121, "step": 5491 }, { "epoch": 2.284533187738866, "grad_norm": 1.8742523193359375, "learning_rate": 2.8282845555328965e-06, "loss": 0.9295, "step": 5492 }, { "epoch": 2.2849491719314665, "grad_norm": 1.8850445747375488, "learning_rate": 2.8251629630039612e-06, "loss": 0.8202, "step": 5493 }, { "epoch": 2.285365156124067, "grad_norm": 1.9393553733825684, "learning_rate": 2.822042810692134e-06, "loss": 0.8079, "step": 5494 }, { "epoch": 2.285781140316668, "grad_norm": 4.674871444702148, "learning_rate": 2.8189240992237256e-06, "loss": 0.6577, "step": 5495 }, { "epoch": 2.2861971245092687, "grad_norm": 25.218189239501953, "learning_rate": 2.8158068292247597e-06, "loss": 0.8423, "step": 5496 }, { "epoch": 2.2866131087018693, "grad_norm": 1.9171175956726074, "learning_rate": 2.8126910013209684e-06, "loss": 0.8155, "step": 5497 }, { "epoch": 2.28702909289447, "grad_norm": 1.9434565305709839, "learning_rate": 2.8095766161378026e-06, "loss": 0.7525, "step": 5498 }, { "epoch": 2.287445077087071, "grad_norm": 80.61231231689453, "learning_rate": 2.8064636743004124e-06, "loss": 0.8423, "step": 5499 }, { "epoch": 2.2878610612796715, "grad_norm": 1.9894691705703735, "learning_rate": 2.80335217643367e-06, "loss": 0.7726, "step": 5500 }, { "epoch": 2.2878610612796715, "eval_loss": 0.7550570368766785, "eval_runtime": 2030.9677, "eval_samples_per_second": 3.245, "eval_steps_per_second": 1.623, "step": 5500 }, { "epoch": 2.288277045472272, "grad_norm": 1.8332877159118652, "learning_rate": 2.8002421231621457e-06, "loss": 0.7287, "step": 5501 }, { "epoch": 2.2886930296648726, "grad_norm": 97.85924530029297, "learning_rate": 2.797133515110132e-06, "loss": 0.8579, "step": 5502 }, { "epoch": 2.2891090138574732, "grad_norm": 1.8542379140853882, "learning_rate": 2.794026352901619e-06, "loss": 0.7713, "step": 5503 }, { "epoch": 2.2895249980500743, "grad_norm": 1.9284858703613281, "learning_rate": 2.790920637160317e-06, "loss": 0.8195, "step": 5504 }, { "epoch": 2.289940982242675, "grad_norm": 1.6924175024032593, "learning_rate": 2.7878163685096406e-06, "loss": 0.6996, "step": 5505 }, { "epoch": 2.2903569664352754, "grad_norm": 44.404598236083984, "learning_rate": 2.7847135475727094e-06, "loss": 0.7882, "step": 5506 }, { "epoch": 2.290772950627876, "grad_norm": 1.9441603422164917, "learning_rate": 2.781612174972367e-06, "loss": 0.842, "step": 5507 }, { "epoch": 2.291188934820477, "grad_norm": 1.8318150043487549, "learning_rate": 2.778512251331149e-06, "loss": 0.7365, "step": 5508 }, { "epoch": 2.2916049190130776, "grad_norm": 1.9982166290283203, "learning_rate": 2.7754137772713153e-06, "loss": 0.8804, "step": 5509 }, { "epoch": 2.292020903205678, "grad_norm": 1.9237959384918213, "learning_rate": 2.772316753414821e-06, "loss": 0.7381, "step": 5510 }, { "epoch": 2.2924368873982788, "grad_norm": 2.0113637447357178, "learning_rate": 2.7692211803833437e-06, "loss": 0.7239, "step": 5511 }, { "epoch": 2.2928528715908794, "grad_norm": 1.897912621498108, "learning_rate": 2.766127058798256e-06, "loss": 0.8742, "step": 5512 }, { "epoch": 2.2932688557834804, "grad_norm": 1.8036231994628906, "learning_rate": 2.763034389280653e-06, "loss": 0.8214, "step": 5513 }, { "epoch": 2.293684839976081, "grad_norm": 1.919982671737671, "learning_rate": 2.7599431724513283e-06, "loss": 0.8194, "step": 5514 }, { "epoch": 2.2941008241686816, "grad_norm": 13.344615936279297, "learning_rate": 2.7568534089307865e-06, "loss": 0.8556, "step": 5515 }, { "epoch": 2.294516808361282, "grad_norm": 1.9229345321655273, "learning_rate": 2.7537650993392383e-06, "loss": 0.8636, "step": 5516 }, { "epoch": 2.294932792553883, "grad_norm": 21.852275848388672, "learning_rate": 2.750678244296612e-06, "loss": 0.7522, "step": 5517 }, { "epoch": 2.2953487767464837, "grad_norm": 1.919541597366333, "learning_rate": 2.747592844422531e-06, "loss": 0.7843, "step": 5518 }, { "epoch": 2.2957647609390843, "grad_norm": 1.9965128898620605, "learning_rate": 2.744508900336338e-06, "loss": 0.8564, "step": 5519 }, { "epoch": 2.296180745131685, "grad_norm": 1.809077501296997, "learning_rate": 2.741426412657072e-06, "loss": 0.8256, "step": 5520 }, { "epoch": 2.2965967293242855, "grad_norm": 1.7407242059707642, "learning_rate": 2.73834538200349e-06, "loss": 0.708, "step": 5521 }, { "epoch": 2.2970127135168865, "grad_norm": 1.761285662651062, "learning_rate": 2.7352658089940554e-06, "loss": 0.8309, "step": 5522 }, { "epoch": 2.297428697709487, "grad_norm": 1.8484400510787964, "learning_rate": 2.7321876942469295e-06, "loss": 0.832, "step": 5523 }, { "epoch": 2.2978446819020877, "grad_norm": 1.921112298965454, "learning_rate": 2.7291110383799937e-06, "loss": 0.8289, "step": 5524 }, { "epoch": 2.2982606660946883, "grad_norm": 139.20533752441406, "learning_rate": 2.726035842010828e-06, "loss": 0.8367, "step": 5525 }, { "epoch": 2.2986766502872893, "grad_norm": 1.9569592475891113, "learning_rate": 2.7229621057567203e-06, "loss": 0.7991, "step": 5526 }, { "epoch": 2.29909263447989, "grad_norm": 2.0512964725494385, "learning_rate": 2.7198898302346643e-06, "loss": 0.8815, "step": 5527 }, { "epoch": 2.2995086186724905, "grad_norm": 1.7670644521713257, "learning_rate": 2.7168190160613704e-06, "loss": 0.7987, "step": 5528 }, { "epoch": 2.299924602865091, "grad_norm": 1.8735378980636597, "learning_rate": 2.7137496638532402e-06, "loss": 0.8696, "step": 5529 }, { "epoch": 2.3003405870576916, "grad_norm": 1.9722179174423218, "learning_rate": 2.710681774226398e-06, "loss": 0.7697, "step": 5530 }, { "epoch": 2.3007565712502926, "grad_norm": 1.7734421491622925, "learning_rate": 2.7076153477966595e-06, "loss": 0.7094, "step": 5531 }, { "epoch": 2.3011725554428932, "grad_norm": 1.8699805736541748, "learning_rate": 2.7045503851795585e-06, "loss": 0.832, "step": 5532 }, { "epoch": 2.301588539635494, "grad_norm": 1.8923965692520142, "learning_rate": 2.7014868869903255e-06, "loss": 0.8248, "step": 5533 }, { "epoch": 2.3020045238280944, "grad_norm": 2.0091300010681152, "learning_rate": 2.698424853843907e-06, "loss": 0.8167, "step": 5534 }, { "epoch": 2.3024205080206954, "grad_norm": 1.924997091293335, "learning_rate": 2.6953642863549478e-06, "loss": 0.6811, "step": 5535 }, { "epoch": 2.302836492213296, "grad_norm": 2.70516037940979, "learning_rate": 2.6923051851378014e-06, "loss": 0.8019, "step": 5536 }, { "epoch": 2.3032524764058966, "grad_norm": 1.9340845346450806, "learning_rate": 2.689247550806521e-06, "loss": 0.828, "step": 5537 }, { "epoch": 2.303668460598497, "grad_norm": 1.8515865802764893, "learning_rate": 2.6861913839748743e-06, "loss": 0.83, "step": 5538 }, { "epoch": 2.3040844447910978, "grad_norm": 1.7126187086105347, "learning_rate": 2.6831366852563356e-06, "loss": 0.7387, "step": 5539 }, { "epoch": 2.304500428983699, "grad_norm": 1.857724666595459, "learning_rate": 2.680083455264072e-06, "loss": 0.8028, "step": 5540 }, { "epoch": 2.3049164131762994, "grad_norm": 1.9613382816314697, "learning_rate": 2.6770316946109707e-06, "loss": 0.8828, "step": 5541 }, { "epoch": 2.3053323973689, "grad_norm": 1.8146551847457886, "learning_rate": 2.673981403909609e-06, "loss": 0.8594, "step": 5542 }, { "epoch": 2.3057483815615005, "grad_norm": 1.8830068111419678, "learning_rate": 2.6709325837722855e-06, "loss": 0.7329, "step": 5543 }, { "epoch": 2.3061643657541016, "grad_norm": 1.933727741241455, "learning_rate": 2.6678852348109895e-06, "loss": 0.7373, "step": 5544 }, { "epoch": 2.306580349946702, "grad_norm": 1.907900333404541, "learning_rate": 2.664839357637421e-06, "loss": 0.7042, "step": 5545 }, { "epoch": 2.3069963341393027, "grad_norm": 1.783349871635437, "learning_rate": 2.6617949528629826e-06, "loss": 0.7835, "step": 5546 }, { "epoch": 2.3074123183319033, "grad_norm": 2.009251356124878, "learning_rate": 2.658752021098787e-06, "loss": 0.8811, "step": 5547 }, { "epoch": 2.307828302524504, "grad_norm": 1.8755844831466675, "learning_rate": 2.655710562955641e-06, "loss": 0.8731, "step": 5548 }, { "epoch": 2.308244286717105, "grad_norm": 1.782192349433899, "learning_rate": 2.65267057904407e-06, "loss": 0.8192, "step": 5549 }, { "epoch": 2.3086602709097055, "grad_norm": 1.857275128364563, "learning_rate": 2.6496320699742863e-06, "loss": 0.7108, "step": 5550 }, { "epoch": 2.309076255102306, "grad_norm": 1.8572232723236084, "learning_rate": 2.6465950363562222e-06, "loss": 0.7974, "step": 5551 }, { "epoch": 2.3094922392949067, "grad_norm": 107.54161071777344, "learning_rate": 2.6435594787995e-06, "loss": 0.7806, "step": 5552 }, { "epoch": 2.3099082234875077, "grad_norm": 1.9162112474441528, "learning_rate": 2.6405253979134595e-06, "loss": 0.7196, "step": 5553 }, { "epoch": 2.3103242076801083, "grad_norm": 1.8116505146026611, "learning_rate": 2.6374927943071325e-06, "loss": 0.8317, "step": 5554 }, { "epoch": 2.310740191872709, "grad_norm": 19.846908569335938, "learning_rate": 2.6344616685892565e-06, "loss": 0.7445, "step": 5555 }, { "epoch": 2.3111561760653094, "grad_norm": 2.186486005783081, "learning_rate": 2.6314320213682797e-06, "loss": 0.8679, "step": 5556 }, { "epoch": 2.31157216025791, "grad_norm": 1.7485262155532837, "learning_rate": 2.6284038532523426e-06, "loss": 0.715, "step": 5557 }, { "epoch": 2.311988144450511, "grad_norm": 19.11493492126465, "learning_rate": 2.625377164849301e-06, "loss": 0.8759, "step": 5558 }, { "epoch": 2.3124041286431116, "grad_norm": 1.8778358697891235, "learning_rate": 2.622351956766701e-06, "loss": 0.8278, "step": 5559 }, { "epoch": 2.312820112835712, "grad_norm": 30.568531036376953, "learning_rate": 2.619328229611803e-06, "loss": 0.8654, "step": 5560 }, { "epoch": 2.313236097028313, "grad_norm": 2.761272668838501, "learning_rate": 2.6163059839915593e-06, "loss": 0.8684, "step": 5561 }, { "epoch": 2.313652081220914, "grad_norm": 1.9625762701034546, "learning_rate": 2.613285220512636e-06, "loss": 0.8313, "step": 5562 }, { "epoch": 2.3140680654135144, "grad_norm": 1.7167706489562988, "learning_rate": 2.6102659397813913e-06, "loss": 0.7219, "step": 5563 }, { "epoch": 2.314484049606115, "grad_norm": 1.8167208433151245, "learning_rate": 2.6072481424038943e-06, "loss": 0.7655, "step": 5564 }, { "epoch": 2.3149000337987156, "grad_norm": 30.965681076049805, "learning_rate": 2.6042318289859114e-06, "loss": 0.8374, "step": 5565 }, { "epoch": 2.315316017991316, "grad_norm": 1.8600068092346191, "learning_rate": 2.6012170001329118e-06, "loss": 0.8748, "step": 5566 }, { "epoch": 2.315732002183917, "grad_norm": 1.9817309379577637, "learning_rate": 2.5982036564500634e-06, "loss": 0.8522, "step": 5567 }, { "epoch": 2.3161479863765178, "grad_norm": 1.7884087562561035, "learning_rate": 2.5951917985422436e-06, "loss": 0.7283, "step": 5568 }, { "epoch": 2.3165639705691183, "grad_norm": 1.8122155666351318, "learning_rate": 2.592181427014031e-06, "loss": 0.7231, "step": 5569 }, { "epoch": 2.316979954761719, "grad_norm": 2.0331709384918213, "learning_rate": 2.5891725424696957e-06, "loss": 0.7757, "step": 5570 }, { "epoch": 2.31739593895432, "grad_norm": 1.8223741054534912, "learning_rate": 2.586165145513223e-06, "loss": 0.7725, "step": 5571 }, { "epoch": 2.3178119231469205, "grad_norm": 1.781233310699463, "learning_rate": 2.583159236748285e-06, "loss": 0.7467, "step": 5572 }, { "epoch": 2.318227907339521, "grad_norm": 1.6684130430221558, "learning_rate": 2.5801548167782707e-06, "loss": 0.6564, "step": 5573 }, { "epoch": 2.3186438915321217, "grad_norm": 1.7605706453323364, "learning_rate": 2.5771518862062595e-06, "loss": 0.7515, "step": 5574 }, { "epoch": 2.3190598757247223, "grad_norm": 1.7473546266555786, "learning_rate": 2.574150445635033e-06, "loss": 0.7194, "step": 5575 }, { "epoch": 2.3194758599173233, "grad_norm": 1.8842352628707886, "learning_rate": 2.571150495667073e-06, "loss": 0.74, "step": 5576 }, { "epoch": 2.319891844109924, "grad_norm": 2.0103647708892822, "learning_rate": 2.568152036904571e-06, "loss": 0.8433, "step": 5577 }, { "epoch": 2.3203078283025245, "grad_norm": 1.957234501838684, "learning_rate": 2.565155069949408e-06, "loss": 0.814, "step": 5578 }, { "epoch": 2.320723812495125, "grad_norm": 1.796357274055481, "learning_rate": 2.562159595403173e-06, "loss": 0.9048, "step": 5579 }, { "epoch": 2.321139796687726, "grad_norm": 1.8539451360702515, "learning_rate": 2.5591656138671484e-06, "loss": 0.8316, "step": 5580 }, { "epoch": 2.3215557808803267, "grad_norm": 1.7594629526138306, "learning_rate": 2.556173125942327e-06, "loss": 0.737, "step": 5581 }, { "epoch": 2.3219717650729272, "grad_norm": 1.7686383724212646, "learning_rate": 2.5531821322293913e-06, "loss": 0.8156, "step": 5582 }, { "epoch": 2.322387749265528, "grad_norm": 1.8981224298477173, "learning_rate": 2.550192633328732e-06, "loss": 0.7028, "step": 5583 }, { "epoch": 2.3228037334581284, "grad_norm": 1.8737431764602661, "learning_rate": 2.547204629840435e-06, "loss": 0.834, "step": 5584 }, { "epoch": 2.3232197176507294, "grad_norm": 1.9067986011505127, "learning_rate": 2.5442181223642835e-06, "loss": 0.7323, "step": 5585 }, { "epoch": 2.32363570184333, "grad_norm": 1.9065104722976685, "learning_rate": 2.5412331114997713e-06, "loss": 0.793, "step": 5586 }, { "epoch": 2.3240516860359306, "grad_norm": 1.893757700920105, "learning_rate": 2.5382495978460774e-06, "loss": 0.7749, "step": 5587 }, { "epoch": 2.324467670228531, "grad_norm": 1.7119553089141846, "learning_rate": 2.5352675820020935e-06, "loss": 0.7872, "step": 5588 }, { "epoch": 2.324883654421132, "grad_norm": 1.9967262744903564, "learning_rate": 2.532287064566399e-06, "loss": 0.9238, "step": 5589 }, { "epoch": 2.325299638613733, "grad_norm": 1.8299704790115356, "learning_rate": 2.5293080461372844e-06, "loss": 0.8385, "step": 5590 }, { "epoch": 2.3257156228063334, "grad_norm": 1.8548516035079956, "learning_rate": 2.526330527312727e-06, "loss": 0.786, "step": 5591 }, { "epoch": 2.326131606998934, "grad_norm": 57.65816879272461, "learning_rate": 2.5233545086904144e-06, "loss": 0.7072, "step": 5592 }, { "epoch": 2.3265475911915345, "grad_norm": 2.018278121948242, "learning_rate": 2.520379990867726e-06, "loss": 0.8834, "step": 5593 }, { "epoch": 2.3269635753841356, "grad_norm": 1.9417343139648438, "learning_rate": 2.5174069744417416e-06, "loss": 0.7134, "step": 5594 }, { "epoch": 2.327379559576736, "grad_norm": 2.09743070602417, "learning_rate": 2.514435460009236e-06, "loss": 0.7276, "step": 5595 }, { "epoch": 2.3277955437693367, "grad_norm": 1.9250322580337524, "learning_rate": 2.5114654481666923e-06, "loss": 0.8769, "step": 5596 }, { "epoch": 2.3282115279619373, "grad_norm": 29.474872589111328, "learning_rate": 2.508496939510281e-06, "loss": 0.8612, "step": 5597 }, { "epoch": 2.3286275121545383, "grad_norm": 2.10711407661438, "learning_rate": 2.5055299346358786e-06, "loss": 0.7625, "step": 5598 }, { "epoch": 2.329043496347139, "grad_norm": 1.9775099754333496, "learning_rate": 2.5025644341390586e-06, "loss": 0.791, "step": 5599 }, { "epoch": 2.3294594805397395, "grad_norm": 1.8992786407470703, "learning_rate": 2.4996004386150864e-06, "loss": 0.6936, "step": 5600 }, { "epoch": 2.32987546473234, "grad_norm": 1.9470515251159668, "learning_rate": 2.4966379486589353e-06, "loss": 0.8247, "step": 5601 }, { "epoch": 2.3302914489249407, "grad_norm": 1.7397925853729248, "learning_rate": 2.4936769648652647e-06, "loss": 0.8459, "step": 5602 }, { "epoch": 2.3307074331175417, "grad_norm": 2.073828935623169, "learning_rate": 2.4907174878284445e-06, "loss": 0.8561, "step": 5603 }, { "epoch": 2.3311234173101423, "grad_norm": 1.8640216588974, "learning_rate": 2.487759518142532e-06, "loss": 0.7282, "step": 5604 }, { "epoch": 2.331539401502743, "grad_norm": 1.9374058246612549, "learning_rate": 2.484803056401285e-06, "loss": 0.8567, "step": 5605 }, { "epoch": 2.3319553856953434, "grad_norm": 1.7429869174957275, "learning_rate": 2.481848103198157e-06, "loss": 0.6227, "step": 5606 }, { "epoch": 2.3323713698879445, "grad_norm": 2.016993761062622, "learning_rate": 2.4788946591263065e-06, "loss": 0.8545, "step": 5607 }, { "epoch": 2.332787354080545, "grad_norm": 1.9157263040542603, "learning_rate": 2.475942724778576e-06, "loss": 0.7857, "step": 5608 }, { "epoch": 2.3332033382731456, "grad_norm": 1.836721420288086, "learning_rate": 2.472992300747521e-06, "loss": 0.8621, "step": 5609 }, { "epoch": 2.333619322465746, "grad_norm": 1.8478254079818726, "learning_rate": 2.470043387625376e-06, "loss": 0.7989, "step": 5610 }, { "epoch": 2.334035306658347, "grad_norm": 1.7713963985443115, "learning_rate": 2.4670959860040887e-06, "loss": 0.757, "step": 5611 }, { "epoch": 2.334451290850948, "grad_norm": 1.7683746814727783, "learning_rate": 2.464150096475291e-06, "loss": 0.7458, "step": 5612 }, { "epoch": 2.3348672750435484, "grad_norm": 1.7968858480453491, "learning_rate": 2.4612057196303195e-06, "loss": 0.6759, "step": 5613 }, { "epoch": 2.335283259236149, "grad_norm": 1.9892057180404663, "learning_rate": 2.458262856060204e-06, "loss": 0.8068, "step": 5614 }, { "epoch": 2.3356992434287496, "grad_norm": 5.837474822998047, "learning_rate": 2.4553215063556656e-06, "loss": 0.7165, "step": 5615 }, { "epoch": 2.3361152276213506, "grad_norm": 1.71492600440979, "learning_rate": 2.4523816711071324e-06, "loss": 0.6559, "step": 5616 }, { "epoch": 2.336531211813951, "grad_norm": 2.9684298038482666, "learning_rate": 2.4494433509047177e-06, "loss": 0.7576, "step": 5617 }, { "epoch": 2.3369471960065518, "grad_norm": 1.9237741231918335, "learning_rate": 2.446506546338241e-06, "loss": 0.9399, "step": 5618 }, { "epoch": 2.3373631801991523, "grad_norm": 1.8459136486053467, "learning_rate": 2.4435712579972047e-06, "loss": 0.7532, "step": 5619 }, { "epoch": 2.337779164391753, "grad_norm": 1.8654330968856812, "learning_rate": 2.4406374864708214e-06, "loss": 0.8192, "step": 5620 }, { "epoch": 2.338195148584354, "grad_norm": 1.7503308057785034, "learning_rate": 2.4377052323479864e-06, "loss": 0.8031, "step": 5621 }, { "epoch": 2.3386111327769545, "grad_norm": 124.96406555175781, "learning_rate": 2.4347744962173e-06, "loss": 0.841, "step": 5622 }, { "epoch": 2.339027116969555, "grad_norm": 1.8501086235046387, "learning_rate": 2.431845278667053e-06, "loss": 0.7774, "step": 5623 }, { "epoch": 2.3394431011621557, "grad_norm": 1.9482738971710205, "learning_rate": 2.4289175802852315e-06, "loss": 0.8063, "step": 5624 }, { "epoch": 2.3398590853547567, "grad_norm": 2.0345957279205322, "learning_rate": 2.4259914016595144e-06, "loss": 0.7971, "step": 5625 }, { "epoch": 2.3402750695473573, "grad_norm": 2.0019216537475586, "learning_rate": 2.4230667433772836e-06, "loss": 0.903, "step": 5626 }, { "epoch": 2.340691053739958, "grad_norm": 1.8523801565170288, "learning_rate": 2.4201436060256057e-06, "loss": 0.7293, "step": 5627 }, { "epoch": 2.3411070379325585, "grad_norm": 1.9257975816726685, "learning_rate": 2.4172219901912497e-06, "loss": 0.8204, "step": 5628 }, { "epoch": 2.341523022125159, "grad_norm": 1.9479597806930542, "learning_rate": 2.414301896460679e-06, "loss": 0.8626, "step": 5629 }, { "epoch": 2.34193900631776, "grad_norm": 1.9681529998779297, "learning_rate": 2.4113833254200435e-06, "loss": 0.7915, "step": 5630 }, { "epoch": 2.3423549905103607, "grad_norm": 1.9725146293640137, "learning_rate": 2.408466277655198e-06, "loss": 0.84, "step": 5631 }, { "epoch": 2.3427709747029613, "grad_norm": 1.8874136209487915, "learning_rate": 2.4055507537516844e-06, "loss": 0.8409, "step": 5632 }, { "epoch": 2.343186958895562, "grad_norm": 1.8561482429504395, "learning_rate": 2.4026367542947406e-06, "loss": 0.7791, "step": 5633 }, { "epoch": 2.343602943088163, "grad_norm": 1.7872886657714844, "learning_rate": 2.3997242798692944e-06, "loss": 0.8141, "step": 5634 }, { "epoch": 2.3440189272807634, "grad_norm": 1.8972185850143433, "learning_rate": 2.3968133310599795e-06, "loss": 0.7813, "step": 5635 }, { "epoch": 2.344434911473364, "grad_norm": 1.9427827596664429, "learning_rate": 2.3939039084511084e-06, "loss": 0.9034, "step": 5636 }, { "epoch": 2.3448508956659646, "grad_norm": 1.8068604469299316, "learning_rate": 2.3909960126267006e-06, "loss": 0.7487, "step": 5637 }, { "epoch": 2.345266879858565, "grad_norm": 1.8388540744781494, "learning_rate": 2.388089644170457e-06, "loss": 0.7635, "step": 5638 }, { "epoch": 2.345682864051166, "grad_norm": 1.7803255319595337, "learning_rate": 2.3851848036657832e-06, "loss": 0.7753, "step": 5639 }, { "epoch": 2.346098848243767, "grad_norm": 1.7053344249725342, "learning_rate": 2.382281491695768e-06, "loss": 0.711, "step": 5640 }, { "epoch": 2.3465148324363674, "grad_norm": 1.8684639930725098, "learning_rate": 2.3793797088432036e-06, "loss": 0.8106, "step": 5641 }, { "epoch": 2.346930816628968, "grad_norm": 1.823306679725647, "learning_rate": 2.3764794556905637e-06, "loss": 0.8298, "step": 5642 }, { "epoch": 2.347346800821569, "grad_norm": 1.7465797662734985, "learning_rate": 2.3735807328200266e-06, "loss": 0.7464, "step": 5643 }, { "epoch": 2.3477627850141696, "grad_norm": 1.8540548086166382, "learning_rate": 2.3706835408134553e-06, "loss": 0.7559, "step": 5644 }, { "epoch": 2.34817876920677, "grad_norm": 1.7688993215560913, "learning_rate": 2.3677878802524045e-06, "loss": 0.7539, "step": 5645 }, { "epoch": 2.3485947533993707, "grad_norm": 1.7151460647583008, "learning_rate": 2.3648937517181314e-06, "loss": 0.789, "step": 5646 }, { "epoch": 2.3490107375919713, "grad_norm": 2.0705435276031494, "learning_rate": 2.3620011557915735e-06, "loss": 0.8725, "step": 5647 }, { "epoch": 2.3494267217845723, "grad_norm": 1.838765025138855, "learning_rate": 2.3591100930533716e-06, "loss": 0.754, "step": 5648 }, { "epoch": 2.349842705977173, "grad_norm": 2.0715982913970947, "learning_rate": 2.3562205640838487e-06, "loss": 0.7573, "step": 5649 }, { "epoch": 2.3502586901697735, "grad_norm": 2.042149543762207, "learning_rate": 2.353332569463029e-06, "loss": 0.8663, "step": 5650 }, { "epoch": 2.350674674362374, "grad_norm": 1.8023446798324585, "learning_rate": 2.350446109770622e-06, "loss": 0.8711, "step": 5651 }, { "epoch": 2.351090658554975, "grad_norm": 1.8460274934768677, "learning_rate": 2.3475611855860336e-06, "loss": 0.8723, "step": 5652 }, { "epoch": 2.3515066427475757, "grad_norm": 1.9666082859039307, "learning_rate": 2.3446777974883594e-06, "loss": 0.8397, "step": 5653 }, { "epoch": 2.3519226269401763, "grad_norm": 1.8109338283538818, "learning_rate": 2.3417959460563845e-06, "loss": 0.7964, "step": 5654 }, { "epoch": 2.352338611132777, "grad_norm": 1.9265506267547607, "learning_rate": 2.3389156318685868e-06, "loss": 0.8654, "step": 5655 }, { "epoch": 2.3527545953253775, "grad_norm": 2.037681818008423, "learning_rate": 2.336036855503141e-06, "loss": 0.8599, "step": 5656 }, { "epoch": 2.3531705795179785, "grad_norm": 1.7845897674560547, "learning_rate": 2.3331596175379044e-06, "loss": 0.7374, "step": 5657 }, { "epoch": 2.353586563710579, "grad_norm": 1.7687644958496094, "learning_rate": 2.3302839185504345e-06, "loss": 0.7451, "step": 5658 }, { "epoch": 2.3540025479031796, "grad_norm": 1.9114885330200195, "learning_rate": 2.327409759117969e-06, "loss": 0.722, "step": 5659 }, { "epoch": 2.3544185320957802, "grad_norm": 1.8989843130111694, "learning_rate": 2.3245371398174466e-06, "loss": 0.8372, "step": 5660 }, { "epoch": 2.3548345162883813, "grad_norm": 1.976554274559021, "learning_rate": 2.321666061225496e-06, "loss": 0.9425, "step": 5661 }, { "epoch": 2.355250500480982, "grad_norm": 1.8527144193649292, "learning_rate": 2.3187965239184297e-06, "loss": 0.7556, "step": 5662 }, { "epoch": 2.3556664846735824, "grad_norm": 1.7839701175689697, "learning_rate": 2.3159285284722555e-06, "loss": 0.7667, "step": 5663 }, { "epoch": 2.356082468866183, "grad_norm": 1.8490561246871948, "learning_rate": 2.3130620754626686e-06, "loss": 0.8129, "step": 5664 }, { "epoch": 2.3564984530587836, "grad_norm": 1.8848167657852173, "learning_rate": 2.3101971654650614e-06, "loss": 0.8758, "step": 5665 }, { "epoch": 2.3569144372513846, "grad_norm": 1.8932052850723267, "learning_rate": 2.3073337990545076e-06, "loss": 0.8402, "step": 5666 }, { "epoch": 2.357330421443985, "grad_norm": 1.9274210929870605, "learning_rate": 2.30447197680578e-06, "loss": 0.7309, "step": 5667 }, { "epoch": 2.3577464056365858, "grad_norm": 1.874631643295288, "learning_rate": 2.3016116992933325e-06, "loss": 0.8645, "step": 5668 }, { "epoch": 2.3581623898291864, "grad_norm": 1.9577217102050781, "learning_rate": 2.298752967091318e-06, "loss": 0.8439, "step": 5669 }, { "epoch": 2.3585783740217874, "grad_norm": 1.9177113771438599, "learning_rate": 2.295895780773569e-06, "loss": 0.7708, "step": 5670 }, { "epoch": 2.358994358214388, "grad_norm": 1.7890626192092896, "learning_rate": 2.2930401409136214e-06, "loss": 0.6765, "step": 5671 }, { "epoch": 2.3594103424069885, "grad_norm": 1.9827817678451538, "learning_rate": 2.290186048084686e-06, "loss": 0.7913, "step": 5672 }, { "epoch": 2.359826326599589, "grad_norm": 1.9460606575012207, "learning_rate": 2.287333502859673e-06, "loss": 0.7472, "step": 5673 }, { "epoch": 2.3602423107921897, "grad_norm": 1.8386567831039429, "learning_rate": 2.2844825058111732e-06, "loss": 0.7331, "step": 5674 }, { "epoch": 2.3606582949847907, "grad_norm": 4.994161128997803, "learning_rate": 2.2816330575114765e-06, "loss": 0.8116, "step": 5675 }, { "epoch": 2.3610742791773913, "grad_norm": 1.79104745388031, "learning_rate": 2.2787851585325595e-06, "loss": 0.795, "step": 5676 }, { "epoch": 2.361490263369992, "grad_norm": 1.8641066551208496, "learning_rate": 2.275938809446081e-06, "loss": 0.8459, "step": 5677 }, { "epoch": 2.3619062475625925, "grad_norm": 1.8575087785720825, "learning_rate": 2.2730940108233977e-06, "loss": 0.7589, "step": 5678 }, { "epoch": 2.3623222317551935, "grad_norm": 1.9741674661636353, "learning_rate": 2.2702507632355454e-06, "loss": 0.851, "step": 5679 }, { "epoch": 2.362738215947794, "grad_norm": 1.6949810981750488, "learning_rate": 2.2674090672532612e-06, "loss": 0.6929, "step": 5680 }, { "epoch": 2.3631542001403947, "grad_norm": 1.958175778388977, "learning_rate": 2.264568923446956e-06, "loss": 0.77, "step": 5681 }, { "epoch": 2.3635701843329953, "grad_norm": 1.9672596454620361, "learning_rate": 2.2617303323867436e-06, "loss": 0.7735, "step": 5682 }, { "epoch": 2.363986168525596, "grad_norm": 1.8730577230453491, "learning_rate": 2.258893294642415e-06, "loss": 0.6775, "step": 5683 }, { "epoch": 2.364402152718197, "grad_norm": 1.8237512111663818, "learning_rate": 2.256057810783455e-06, "loss": 0.7484, "step": 5684 }, { "epoch": 2.3648181369107975, "grad_norm": 1.9855204820632935, "learning_rate": 2.253223881379032e-06, "loss": 0.8317, "step": 5685 }, { "epoch": 2.365234121103398, "grad_norm": 1.9991101026535034, "learning_rate": 2.2503915069980098e-06, "loss": 0.8427, "step": 5686 }, { "epoch": 2.3656501052959986, "grad_norm": 1.815479040145874, "learning_rate": 2.247560688208931e-06, "loss": 0.8272, "step": 5687 }, { "epoch": 2.3660660894885996, "grad_norm": 1.8853473663330078, "learning_rate": 2.2447314255800357e-06, "loss": 0.7656, "step": 5688 }, { "epoch": 2.3664820736812002, "grad_norm": 1.9321120977401733, "learning_rate": 2.2419037196792414e-06, "loss": 0.6741, "step": 5689 }, { "epoch": 2.366898057873801, "grad_norm": 1.8355404138565063, "learning_rate": 2.239077571074162e-06, "loss": 0.874, "step": 5690 }, { "epoch": 2.3673140420664014, "grad_norm": 1.8459773063659668, "learning_rate": 2.2362529803320964e-06, "loss": 0.7271, "step": 5691 }, { "epoch": 2.367730026259002, "grad_norm": 1.898056983947754, "learning_rate": 2.233429948020027e-06, "loss": 0.8207, "step": 5692 }, { "epoch": 2.368146010451603, "grad_norm": 1.925300121307373, "learning_rate": 2.2306084747046254e-06, "loss": 0.776, "step": 5693 }, { "epoch": 2.3685619946442036, "grad_norm": 1.8780025243759155, "learning_rate": 2.2277885609522498e-06, "loss": 0.8466, "step": 5694 }, { "epoch": 2.368977978836804, "grad_norm": 2.120702028274536, "learning_rate": 2.2249702073289494e-06, "loss": 0.8063, "step": 5695 }, { "epoch": 2.3693939630294047, "grad_norm": 2.07495379447937, "learning_rate": 2.222153414400452e-06, "loss": 0.6787, "step": 5696 }, { "epoch": 2.3698099472220058, "grad_norm": 1.8472245931625366, "learning_rate": 2.219338182732185e-06, "loss": 0.714, "step": 5697 }, { "epoch": 2.3702259314146064, "grad_norm": 1.7923667430877686, "learning_rate": 2.2165245128892455e-06, "loss": 0.7687, "step": 5698 }, { "epoch": 2.370641915607207, "grad_norm": 1.8047544956207275, "learning_rate": 2.2137124054364347e-06, "loss": 0.897, "step": 5699 }, { "epoch": 2.3710578997998075, "grad_norm": 1.9915341138839722, "learning_rate": 2.210901860938224e-06, "loss": 0.8215, "step": 5700 }, { "epoch": 2.371473883992408, "grad_norm": 1.8228654861450195, "learning_rate": 2.2080928799587864e-06, "loss": 0.8025, "step": 5701 }, { "epoch": 2.371889868185009, "grad_norm": 1.773558259010315, "learning_rate": 2.205285463061968e-06, "loss": 0.8133, "step": 5702 }, { "epoch": 2.3723058523776097, "grad_norm": 1.8249578475952148, "learning_rate": 2.202479610811309e-06, "loss": 0.7782, "step": 5703 }, { "epoch": 2.3727218365702103, "grad_norm": 1.869526982307434, "learning_rate": 2.1996753237700286e-06, "loss": 0.6784, "step": 5704 }, { "epoch": 2.373137820762811, "grad_norm": 1.8401612043380737, "learning_rate": 2.196872602501038e-06, "loss": 0.7843, "step": 5705 }, { "epoch": 2.373553804955412, "grad_norm": 2.035374402999878, "learning_rate": 2.1940714475669367e-06, "loss": 0.8165, "step": 5706 }, { "epoch": 2.3739697891480125, "grad_norm": 1.8953988552093506, "learning_rate": 2.1912718595299997e-06, "loss": 0.824, "step": 5707 }, { "epoch": 2.374385773340613, "grad_norm": 1.8630108833312988, "learning_rate": 2.1884738389521964e-06, "loss": 0.7836, "step": 5708 }, { "epoch": 2.3748017575332137, "grad_norm": 52.31303024291992, "learning_rate": 2.185677386395175e-06, "loss": 0.848, "step": 5709 }, { "epoch": 2.3752177417258142, "grad_norm": 1.9757421016693115, "learning_rate": 2.182882502420276e-06, "loss": 0.6572, "step": 5710 }, { "epoch": 2.3756337259184153, "grad_norm": 1.825555682182312, "learning_rate": 2.18008918758852e-06, "loss": 0.8108, "step": 5711 }, { "epoch": 2.376049710111016, "grad_norm": 1.6608250141143799, "learning_rate": 2.177297442460612e-06, "loss": 0.7997, "step": 5712 }, { "epoch": 2.3764656943036164, "grad_norm": 1.9013179540634155, "learning_rate": 2.1745072675969425e-06, "loss": 0.7155, "step": 5713 }, { "epoch": 2.376881678496217, "grad_norm": 1.9215185642242432, "learning_rate": 2.171718663557593e-06, "loss": 0.7778, "step": 5714 }, { "epoch": 2.377297662688818, "grad_norm": 1.8429315090179443, "learning_rate": 2.168931630902318e-06, "loss": 0.8701, "step": 5715 }, { "epoch": 2.3777136468814186, "grad_norm": 1.8617771863937378, "learning_rate": 2.1661461701905704e-06, "loss": 0.7945, "step": 5716 }, { "epoch": 2.378129631074019, "grad_norm": 1.970033049583435, "learning_rate": 2.1633622819814738e-06, "loss": 0.8106, "step": 5717 }, { "epoch": 2.37854561526662, "grad_norm": 1.7850371599197388, "learning_rate": 2.160579966833849e-06, "loss": 0.8504, "step": 5718 }, { "epoch": 2.3789615994592204, "grad_norm": 1.9741995334625244, "learning_rate": 2.1577992253061897e-06, "loss": 0.8308, "step": 5719 }, { "epoch": 2.3793775836518214, "grad_norm": 1.8312917947769165, "learning_rate": 2.1550200579566826e-06, "loss": 0.7953, "step": 5720 }, { "epoch": 2.379793567844422, "grad_norm": 1.939570665359497, "learning_rate": 2.1522424653431908e-06, "loss": 0.7359, "step": 5721 }, { "epoch": 2.3802095520370226, "grad_norm": 705.8114624023438, "learning_rate": 2.1494664480232694e-06, "loss": 0.7186, "step": 5722 }, { "epoch": 2.380625536229623, "grad_norm": 1.8254934549331665, "learning_rate": 2.146692006554152e-06, "loss": 0.7762, "step": 5723 }, { "epoch": 2.381041520422224, "grad_norm": 288.51458740234375, "learning_rate": 2.1439191414927527e-06, "loss": 0.8933, "step": 5724 }, { "epoch": 2.3814575046148247, "grad_norm": 1.8804749250411987, "learning_rate": 2.1411478533956787e-06, "loss": 0.7259, "step": 5725 }, { "epoch": 2.3818734888074253, "grad_norm": 1.9253941774368286, "learning_rate": 2.1383781428192107e-06, "loss": 0.7941, "step": 5726 }, { "epoch": 2.382289473000026, "grad_norm": 1.9289703369140625, "learning_rate": 2.135610010319322e-06, "loss": 0.9367, "step": 5727 }, { "epoch": 2.3827054571926265, "grad_norm": 1.8578542470932007, "learning_rate": 2.13284345645166e-06, "loss": 0.7793, "step": 5728 }, { "epoch": 2.3831214413852275, "grad_norm": 1.790634274482727, "learning_rate": 2.1300784817715648e-06, "loss": 0.7114, "step": 5729 }, { "epoch": 2.383537425577828, "grad_norm": 1.8539199829101562, "learning_rate": 2.1273150868340486e-06, "loss": 0.7755, "step": 5730 }, { "epoch": 2.3839534097704287, "grad_norm": 1.948188066482544, "learning_rate": 2.124553272193818e-06, "loss": 0.8109, "step": 5731 }, { "epoch": 2.3843693939630293, "grad_norm": 1603.5274658203125, "learning_rate": 2.121793038405254e-06, "loss": 0.8133, "step": 5732 }, { "epoch": 2.3847853781556303, "grad_norm": 1.8790805339813232, "learning_rate": 2.1190343860224226e-06, "loss": 0.8621, "step": 5733 }, { "epoch": 2.385201362348231, "grad_norm": 1.8191397190093994, "learning_rate": 2.116277315599071e-06, "loss": 0.8498, "step": 5734 }, { "epoch": 2.3856173465408315, "grad_norm": 1.8464998006820679, "learning_rate": 2.1135218276886325e-06, "loss": 0.7587, "step": 5735 }, { "epoch": 2.386033330733432, "grad_norm": 7.567612171173096, "learning_rate": 2.1107679228442234e-06, "loss": 0.7926, "step": 5736 }, { "epoch": 2.3864493149260326, "grad_norm": 1.6927770376205444, "learning_rate": 2.108015601618635e-06, "loss": 0.7982, "step": 5737 }, { "epoch": 2.3868652991186337, "grad_norm": 1.8474117517471313, "learning_rate": 2.1052648645643493e-06, "loss": 0.7451, "step": 5738 }, { "epoch": 2.3872812833112342, "grad_norm": 1.7470805644989014, "learning_rate": 2.1025157122335228e-06, "loss": 0.8511, "step": 5739 }, { "epoch": 2.387697267503835, "grad_norm": 1.9694466590881348, "learning_rate": 2.0997681451780016e-06, "loss": 0.801, "step": 5740 }, { "epoch": 2.3881132516964354, "grad_norm": 1.737947940826416, "learning_rate": 2.0970221639493084e-06, "loss": 0.756, "step": 5741 }, { "epoch": 2.3885292358890364, "grad_norm": 2.0058343410491943, "learning_rate": 2.094277769098646e-06, "loss": 0.8019, "step": 5742 }, { "epoch": 2.388945220081637, "grad_norm": 1.8894325494766235, "learning_rate": 2.091534961176901e-06, "loss": 0.7875, "step": 5743 }, { "epoch": 2.3893612042742376, "grad_norm": 22.009794235229492, "learning_rate": 2.088793740734646e-06, "loss": 0.7059, "step": 5744 }, { "epoch": 2.389777188466838, "grad_norm": 1.6950198411941528, "learning_rate": 2.0860541083221264e-06, "loss": 0.7278, "step": 5745 }, { "epoch": 2.3901931726594388, "grad_norm": 1.9063318967819214, "learning_rate": 2.083316064489277e-06, "loss": 0.7391, "step": 5746 }, { "epoch": 2.39060915685204, "grad_norm": 1.949725866317749, "learning_rate": 2.0805796097857066e-06, "loss": 0.7725, "step": 5747 }, { "epoch": 2.3910251410446404, "grad_norm": 1.8308627605438232, "learning_rate": 2.077844744760712e-06, "loss": 0.8529, "step": 5748 }, { "epoch": 2.391441125237241, "grad_norm": 1.8998899459838867, "learning_rate": 2.0751114699632622e-06, "loss": 0.856, "step": 5749 }, { "epoch": 2.3918571094298415, "grad_norm": 2.070375680923462, "learning_rate": 2.072379785942018e-06, "loss": 0.8232, "step": 5750 }, { "epoch": 2.3922730936224426, "grad_norm": 1.9891396760940552, "learning_rate": 2.0696496932453124e-06, "loss": 0.8557, "step": 5751 }, { "epoch": 2.392689077815043, "grad_norm": 2.130905866622925, "learning_rate": 2.0669211924211586e-06, "loss": 0.8949, "step": 5752 }, { "epoch": 2.3931050620076437, "grad_norm": 2.014317035675049, "learning_rate": 2.064194284017258e-06, "loss": 0.7797, "step": 5753 }, { "epoch": 2.3935210462002443, "grad_norm": 2.1712496280670166, "learning_rate": 2.061468968580983e-06, "loss": 0.8204, "step": 5754 }, { "epoch": 2.393937030392845, "grad_norm": 1.825749397277832, "learning_rate": 2.0587452466593947e-06, "loss": 0.6139, "step": 5755 }, { "epoch": 2.394353014585446, "grad_norm": 1.951874852180481, "learning_rate": 2.0560231187992265e-06, "loss": 0.8212, "step": 5756 }, { "epoch": 2.3947689987780465, "grad_norm": 1.742728352546692, "learning_rate": 2.0533025855469002e-06, "loss": 0.833, "step": 5757 }, { "epoch": 2.395184982970647, "grad_norm": 2.2251968383789062, "learning_rate": 2.0505836474485086e-06, "loss": 0.7657, "step": 5758 }, { "epoch": 2.3956009671632477, "grad_norm": 2.750389337539673, "learning_rate": 2.0478663050498326e-06, "loss": 0.7288, "step": 5759 }, { "epoch": 2.3960169513558487, "grad_norm": 2.058826446533203, "learning_rate": 2.045150558896325e-06, "loss": 0.8753, "step": 5760 }, { "epoch": 2.3964329355484493, "grad_norm": 1.774828314781189, "learning_rate": 2.042436409533127e-06, "loss": 0.7892, "step": 5761 }, { "epoch": 2.39684891974105, "grad_norm": 1.64333176612854, "learning_rate": 2.0397238575050516e-06, "loss": 0.6327, "step": 5762 }, { "epoch": 2.3972649039336504, "grad_norm": 1.8648713827133179, "learning_rate": 2.0370129033565933e-06, "loss": 0.7496, "step": 5763 }, { "epoch": 2.397680888126251, "grad_norm": 1.82632315158844, "learning_rate": 2.034303547631924e-06, "loss": 0.8155, "step": 5764 }, { "epoch": 2.398096872318852, "grad_norm": 2.0984947681427, "learning_rate": 2.0315957908749007e-06, "loss": 0.8562, "step": 5765 }, { "epoch": 2.3985128565114526, "grad_norm": 1.966728925704956, "learning_rate": 2.0288896336290585e-06, "loss": 0.8658, "step": 5766 }, { "epoch": 2.398928840704053, "grad_norm": 6.625035762786865, "learning_rate": 2.0261850764376033e-06, "loss": 0.8318, "step": 5767 }, { "epoch": 2.399344824896654, "grad_norm": 1.8188388347625732, "learning_rate": 2.0234821198434307e-06, "loss": 0.8081, "step": 5768 }, { "epoch": 2.399760809089255, "grad_norm": 1.9264189004898071, "learning_rate": 2.0207807643891044e-06, "loss": 0.7681, "step": 5769 }, { "epoch": 2.4001767932818554, "grad_norm": 1.9053528308868408, "learning_rate": 2.018081010616877e-06, "loss": 0.8158, "step": 5770 }, { "epoch": 2.400592777474456, "grad_norm": 1.8319023847579956, "learning_rate": 2.015382859068672e-06, "loss": 0.7909, "step": 5771 }, { "epoch": 2.4010087616670566, "grad_norm": 1.793485403060913, "learning_rate": 2.012686310286095e-06, "loss": 0.7582, "step": 5772 }, { "epoch": 2.401424745859657, "grad_norm": 2.087932586669922, "learning_rate": 2.0099913648104253e-06, "loss": 0.7483, "step": 5773 }, { "epoch": 2.401840730052258, "grad_norm": 1.9548392295837402, "learning_rate": 2.0072980231826287e-06, "loss": 0.7589, "step": 5774 }, { "epoch": 2.4022567142448588, "grad_norm": 1.8433881998062134, "learning_rate": 2.004606285943339e-06, "loss": 0.8053, "step": 5775 }, { "epoch": 2.4026726984374593, "grad_norm": 1.8893465995788574, "learning_rate": 2.00191615363288e-06, "loss": 0.8046, "step": 5776 }, { "epoch": 2.40308868263006, "grad_norm": 1.7428081035614014, "learning_rate": 1.999227626791239e-06, "loss": 0.9077, "step": 5777 }, { "epoch": 2.403504666822661, "grad_norm": 2.0473132133483887, "learning_rate": 1.9965407059580955e-06, "loss": 0.7906, "step": 5778 }, { "epoch": 2.4039206510152615, "grad_norm": 52.95634078979492, "learning_rate": 1.9938553916727933e-06, "loss": 0.855, "step": 5779 }, { "epoch": 2.404336635207862, "grad_norm": 1.960495948791504, "learning_rate": 1.9911716844743654e-06, "loss": 0.8211, "step": 5780 }, { "epoch": 2.4047526194004627, "grad_norm": 1.9877548217773438, "learning_rate": 1.988489584901515e-06, "loss": 0.8323, "step": 5781 }, { "epoch": 2.4051686035930633, "grad_norm": 1.817233681678772, "learning_rate": 1.985809093492621e-06, "loss": 0.6866, "step": 5782 }, { "epoch": 2.4055845877856643, "grad_norm": 1.8978546857833862, "learning_rate": 1.9831302107857486e-06, "loss": 0.9207, "step": 5783 }, { "epoch": 2.406000571978265, "grad_norm": 1.870318055152893, "learning_rate": 1.98045293731863e-06, "loss": 0.7912, "step": 5784 }, { "epoch": 2.4064165561708655, "grad_norm": 1.781423807144165, "learning_rate": 1.9777772736286814e-06, "loss": 0.8305, "step": 5785 }, { "epoch": 2.406832540363466, "grad_norm": 1.7204285860061646, "learning_rate": 1.9751032202529907e-06, "loss": 0.7716, "step": 5786 }, { "epoch": 2.407248524556067, "grad_norm": 1.7649140357971191, "learning_rate": 1.9724307777283303e-06, "loss": 0.8342, "step": 5787 }, { "epoch": 2.4076645087486677, "grad_norm": 2.131545305252075, "learning_rate": 1.9697599465911366e-06, "loss": 0.7809, "step": 5788 }, { "epoch": 2.4080804929412682, "grad_norm": 1.6280261278152466, "learning_rate": 1.9670907273775375e-06, "loss": 0.7162, "step": 5789 }, { "epoch": 2.408496477133869, "grad_norm": 2.0355827808380127, "learning_rate": 1.9644231206233267e-06, "loss": 0.834, "step": 5790 }, { "epoch": 2.4089124613264694, "grad_norm": 1.7525383234024048, "learning_rate": 1.9617571268639767e-06, "loss": 0.7901, "step": 5791 }, { "epoch": 2.4093284455190704, "grad_norm": 1.8161561489105225, "learning_rate": 1.9590927466346345e-06, "loss": 0.7184, "step": 5792 }, { "epoch": 2.409744429711671, "grad_norm": 1.8509896993637085, "learning_rate": 1.9564299804701316e-06, "loss": 0.7397, "step": 5793 }, { "epoch": 2.4101604139042716, "grad_norm": 1.9449297189712524, "learning_rate": 1.9537688289049626e-06, "loss": 0.7243, "step": 5794 }, { "epoch": 2.410576398096872, "grad_norm": 1.9171777963638306, "learning_rate": 1.9511092924733124e-06, "loss": 0.8424, "step": 5795 }, { "epoch": 2.410992382289473, "grad_norm": 1.9329164028167725, "learning_rate": 1.948451371709027e-06, "loss": 0.7388, "step": 5796 }, { "epoch": 2.411408366482074, "grad_norm": 1.8692771196365356, "learning_rate": 1.9457950671456406e-06, "loss": 0.7049, "step": 5797 }, { "epoch": 2.4118243506746744, "grad_norm": 1.9213160276412964, "learning_rate": 1.943140379316357e-06, "loss": 0.7823, "step": 5798 }, { "epoch": 2.412240334867275, "grad_norm": 1.851354956626892, "learning_rate": 1.9404873087540554e-06, "loss": 0.8505, "step": 5799 }, { "epoch": 2.4126563190598755, "grad_norm": 1.8732155561447144, "learning_rate": 1.9378358559912878e-06, "loss": 0.7363, "step": 5800 }, { "epoch": 2.4130723032524766, "grad_norm": 1.9399983882904053, "learning_rate": 1.9351860215602912e-06, "loss": 0.8308, "step": 5801 }, { "epoch": 2.413488287445077, "grad_norm": 2.0096147060394287, "learning_rate": 1.9325378059929678e-06, "loss": 0.8629, "step": 5802 }, { "epoch": 2.4139042716376777, "grad_norm": 1.939530372619629, "learning_rate": 1.9298912098208953e-06, "loss": 0.7592, "step": 5803 }, { "epoch": 2.4143202558302783, "grad_norm": 55.48633575439453, "learning_rate": 1.927246233575335e-06, "loss": 0.8158, "step": 5804 }, { "epoch": 2.4147362400228793, "grad_norm": 1.9207751750946045, "learning_rate": 1.924602877787213e-06, "loss": 0.7871, "step": 5805 }, { "epoch": 2.41515222421548, "grad_norm": 1.8697044849395752, "learning_rate": 1.921961142987139e-06, "loss": 0.7445, "step": 5806 }, { "epoch": 2.4155682084080805, "grad_norm": 1.957095980644226, "learning_rate": 1.919321029705388e-06, "loss": 0.7798, "step": 5807 }, { "epoch": 2.415984192600681, "grad_norm": 2.004873514175415, "learning_rate": 1.916682538471918e-06, "loss": 0.816, "step": 5808 }, { "epoch": 2.4164001767932817, "grad_norm": 1.8595832586288452, "learning_rate": 1.9140456698163557e-06, "loss": 0.8362, "step": 5809 }, { "epoch": 2.4168161609858827, "grad_norm": 1.8414369821548462, "learning_rate": 1.9114104242680065e-06, "loss": 0.7267, "step": 5810 }, { "epoch": 2.4172321451784833, "grad_norm": 1.8286426067352295, "learning_rate": 1.9087768023558463e-06, "loss": 0.6533, "step": 5811 }, { "epoch": 2.417648129371084, "grad_norm": 28.624162673950195, "learning_rate": 1.906144804608524e-06, "loss": 0.7452, "step": 5812 }, { "epoch": 2.4180641135636844, "grad_norm": 2.05151104927063, "learning_rate": 1.9035144315543708e-06, "loss": 0.8579, "step": 5813 }, { "epoch": 2.4184800977562855, "grad_norm": 1.78390371799469, "learning_rate": 1.9008856837213785e-06, "loss": 0.7787, "step": 5814 }, { "epoch": 2.418896081948886, "grad_norm": 1.9953840970993042, "learning_rate": 1.8982585616372274e-06, "loss": 0.8655, "step": 5815 }, { "epoch": 2.4193120661414866, "grad_norm": 1.9281421899795532, "learning_rate": 1.8956330658292576e-06, "loss": 0.8611, "step": 5816 }, { "epoch": 2.419728050334087, "grad_norm": 2.0139520168304443, "learning_rate": 1.8930091968244958e-06, "loss": 0.7767, "step": 5817 }, { "epoch": 2.420144034526688, "grad_norm": 362.3393859863281, "learning_rate": 1.890386955149629e-06, "loss": 0.6128, "step": 5818 }, { "epoch": 2.420560018719289, "grad_norm": 1.9120498895645142, "learning_rate": 1.88776634133103e-06, "loss": 0.6194, "step": 5819 }, { "epoch": 2.4209760029118894, "grad_norm": 62.44374465942383, "learning_rate": 1.8851473558947365e-06, "loss": 0.8198, "step": 5820 }, { "epoch": 2.42139198710449, "grad_norm": 1.9037909507751465, "learning_rate": 1.8825299993664615e-06, "loss": 0.8889, "step": 5821 }, { "epoch": 2.4218079712970906, "grad_norm": 1.8115694522857666, "learning_rate": 1.8799142722715891e-06, "loss": 0.7849, "step": 5822 }, { "epoch": 2.4222239554896916, "grad_norm": 1.9293231964111328, "learning_rate": 1.8773001751351838e-06, "loss": 0.9322, "step": 5823 }, { "epoch": 2.422639939682292, "grad_norm": 1.9547662734985352, "learning_rate": 1.8746877084819727e-06, "loss": 0.7939, "step": 5824 }, { "epoch": 2.4230559238748928, "grad_norm": 1.8177791833877563, "learning_rate": 1.8720768728363658e-06, "loss": 0.7683, "step": 5825 }, { "epoch": 2.4234719080674934, "grad_norm": 1.780717372894287, "learning_rate": 1.8694676687224355e-06, "loss": 0.8746, "step": 5826 }, { "epoch": 2.423887892260094, "grad_norm": 2910.120361328125, "learning_rate": 1.8668600966639328e-06, "loss": 0.8172, "step": 5827 }, { "epoch": 2.424303876452695, "grad_norm": 4.539463043212891, "learning_rate": 1.8642541571842842e-06, "loss": 0.7641, "step": 5828 }, { "epoch": 2.4247198606452955, "grad_norm": 1.8459669351577759, "learning_rate": 1.8616498508065817e-06, "loss": 0.8539, "step": 5829 }, { "epoch": 2.425135844837896, "grad_norm": 20.081262588500977, "learning_rate": 1.8590471780535924e-06, "loss": 0.7498, "step": 5830 }, { "epoch": 2.4255518290304967, "grad_norm": 1.8349742889404297, "learning_rate": 1.8564461394477506e-06, "loss": 0.8617, "step": 5831 }, { "epoch": 2.4259678132230977, "grad_norm": 1.899107575416565, "learning_rate": 1.8538467355111744e-06, "loss": 0.8493, "step": 5832 }, { "epoch": 2.4263837974156983, "grad_norm": 1.8437966108322144, "learning_rate": 1.8512489667656398e-06, "loss": 0.7885, "step": 5833 }, { "epoch": 2.426799781608299, "grad_norm": 1.9030128717422485, "learning_rate": 1.8486528337326082e-06, "loss": 0.8629, "step": 5834 }, { "epoch": 2.4272157658008995, "grad_norm": 1.82878839969635, "learning_rate": 1.8460583369331985e-06, "loss": 0.7709, "step": 5835 }, { "epoch": 2.4276317499935, "grad_norm": 1.931474208831787, "learning_rate": 1.8434654768882154e-06, "loss": 0.8905, "step": 5836 }, { "epoch": 2.428047734186101, "grad_norm": 1.844464898109436, "learning_rate": 1.8408742541181224e-06, "loss": 0.8543, "step": 5837 }, { "epoch": 2.4284637183787017, "grad_norm": 4.421690464019775, "learning_rate": 1.8382846691430645e-06, "loss": 0.8518, "step": 5838 }, { "epoch": 2.4288797025713023, "grad_norm": 1.9395278692245483, "learning_rate": 1.8356967224828482e-06, "loss": 0.9053, "step": 5839 }, { "epoch": 2.429295686763903, "grad_norm": 1.8099397420883179, "learning_rate": 1.8331104146569634e-06, "loss": 0.8513, "step": 5840 }, { "epoch": 2.429711670956504, "grad_norm": 1.8168656826019287, "learning_rate": 1.8305257461845593e-06, "loss": 0.8111, "step": 5841 }, { "epoch": 2.4301276551491044, "grad_norm": 1.9609980583190918, "learning_rate": 1.8279427175844588e-06, "loss": 0.8032, "step": 5842 }, { "epoch": 2.430543639341705, "grad_norm": 1.7713558673858643, "learning_rate": 1.8253613293751637e-06, "loss": 0.8238, "step": 5843 }, { "epoch": 2.4309596235343056, "grad_norm": 1.8414809703826904, "learning_rate": 1.822781582074834e-06, "loss": 0.7043, "step": 5844 }, { "epoch": 2.431375607726906, "grad_norm": 1.8920536041259766, "learning_rate": 1.820203476201312e-06, "loss": 0.8637, "step": 5845 }, { "epoch": 2.4317915919195072, "grad_norm": 1.7581104040145874, "learning_rate": 1.8176270122721008e-06, "loss": 0.7058, "step": 5846 }, { "epoch": 2.432207576112108, "grad_norm": 21.941661834716797, "learning_rate": 1.8150521908043827e-06, "loss": 0.7116, "step": 5847 }, { "epoch": 2.4326235603047084, "grad_norm": 71.4819107055664, "learning_rate": 1.8124790123150028e-06, "loss": 0.9298, "step": 5848 }, { "epoch": 2.433039544497309, "grad_norm": 1.933503270149231, "learning_rate": 1.8099074773204816e-06, "loss": 0.8018, "step": 5849 }, { "epoch": 2.43345552868991, "grad_norm": 1.914438247680664, "learning_rate": 1.807337586337007e-06, "loss": 0.7744, "step": 5850 }, { "epoch": 2.4338715128825106, "grad_norm": 1.7909854650497437, "learning_rate": 1.8047693398804367e-06, "loss": 0.7306, "step": 5851 }, { "epoch": 2.434287497075111, "grad_norm": 2.027273178100586, "learning_rate": 1.8022027384662966e-06, "loss": 0.7136, "step": 5852 }, { "epoch": 2.4347034812677117, "grad_norm": 1.8529391288757324, "learning_rate": 1.799637782609791e-06, "loss": 0.7321, "step": 5853 }, { "epoch": 2.4351194654603123, "grad_norm": 1.7882040739059448, "learning_rate": 1.7970744728257826e-06, "loss": 0.8044, "step": 5854 }, { "epoch": 2.4355354496529134, "grad_norm": 1.8787497282028198, "learning_rate": 1.7945128096288122e-06, "loss": 0.821, "step": 5855 }, { "epoch": 2.435951433845514, "grad_norm": 1.9876259565353394, "learning_rate": 1.791952793533084e-06, "loss": 0.8335, "step": 5856 }, { "epoch": 2.4363674180381145, "grad_norm": 2.010343313217163, "learning_rate": 1.7893944250524754e-06, "loss": 0.7717, "step": 5857 }, { "epoch": 2.436783402230715, "grad_norm": 7.838122844696045, "learning_rate": 1.7868377047005358e-06, "loss": 0.7067, "step": 5858 }, { "epoch": 2.437199386423316, "grad_norm": 1.9889507293701172, "learning_rate": 1.784282632990475e-06, "loss": 0.7637, "step": 5859 }, { "epoch": 2.4376153706159167, "grad_norm": 2.229525089263916, "learning_rate": 1.7817292104351802e-06, "loss": 0.9255, "step": 5860 }, { "epoch": 2.4380313548085173, "grad_norm": 2.0518202781677246, "learning_rate": 1.7791774375471982e-06, "loss": 0.7925, "step": 5861 }, { "epoch": 2.438447339001118, "grad_norm": 1.8420758247375488, "learning_rate": 1.7766273148387592e-06, "loss": 0.6888, "step": 5862 }, { "epoch": 2.4388633231937185, "grad_norm": 1.9223651885986328, "learning_rate": 1.7740788428217458e-06, "loss": 0.7787, "step": 5863 }, { "epoch": 2.4392793073863195, "grad_norm": 1.9327324628829956, "learning_rate": 1.7715320220077237e-06, "loss": 0.8608, "step": 5864 }, { "epoch": 2.43969529157892, "grad_norm": 1.666601300239563, "learning_rate": 1.7689868529079135e-06, "loss": 0.6927, "step": 5865 }, { "epoch": 2.4401112757715206, "grad_norm": 1.9138338565826416, "learning_rate": 1.766443336033219e-06, "loss": 0.7387, "step": 5866 }, { "epoch": 2.4405272599641212, "grad_norm": 1.815604329109192, "learning_rate": 1.7639014718941982e-06, "loss": 0.7918, "step": 5867 }, { "epoch": 2.4409432441567223, "grad_norm": 1.8880245685577393, "learning_rate": 1.7613612610010888e-06, "loss": 0.7597, "step": 5868 }, { "epoch": 2.441359228349323, "grad_norm": 2.066555976867676, "learning_rate": 1.7588227038637895e-06, "loss": 0.7286, "step": 5869 }, { "epoch": 2.4417752125419234, "grad_norm": 1.9174362421035767, "learning_rate": 1.7562858009918681e-06, "loss": 0.7825, "step": 5870 }, { "epoch": 2.442191196734524, "grad_norm": 1.9176067113876343, "learning_rate": 1.7537505528945598e-06, "loss": 0.7446, "step": 5871 }, { "epoch": 2.4426071809271246, "grad_norm": 1.8397371768951416, "learning_rate": 1.7512169600807726e-06, "loss": 0.8371, "step": 5872 }, { "epoch": 2.4430231651197256, "grad_norm": 1.755995750427246, "learning_rate": 1.7486850230590791e-06, "loss": 0.6752, "step": 5873 }, { "epoch": 2.443439149312326, "grad_norm": 2.135610580444336, "learning_rate": 1.7461547423377157e-06, "loss": 0.8228, "step": 5874 }, { "epoch": 2.443855133504927, "grad_norm": 400.9678039550781, "learning_rate": 1.743626118424595e-06, "loss": 0.775, "step": 5875 }, { "epoch": 2.4442711176975274, "grad_norm": 2.0317771434783936, "learning_rate": 1.7410991518272868e-06, "loss": 0.7645, "step": 5876 }, { "epoch": 2.4446871018901284, "grad_norm": 1.965739130973816, "learning_rate": 1.7385738430530374e-06, "loss": 0.8578, "step": 5877 }, { "epoch": 2.445103086082729, "grad_norm": 1.8013076782226562, "learning_rate": 1.7360501926087547e-06, "loss": 0.7315, "step": 5878 }, { "epoch": 2.4455190702753296, "grad_norm": 1.9358452558517456, "learning_rate": 1.7335282010010134e-06, "loss": 0.7347, "step": 5879 }, { "epoch": 2.44593505446793, "grad_norm": 1.8765608072280884, "learning_rate": 1.731007868736061e-06, "loss": 0.8414, "step": 5880 }, { "epoch": 2.4463510386605307, "grad_norm": 1.9210070371627808, "learning_rate": 1.7284891963198047e-06, "loss": 0.8128, "step": 5881 }, { "epoch": 2.4467670228531317, "grad_norm": 1.6780015230178833, "learning_rate": 1.7259721842578226e-06, "loss": 0.808, "step": 5882 }, { "epoch": 2.4471830070457323, "grad_norm": 1.7158088684082031, "learning_rate": 1.7234568330553603e-06, "loss": 0.653, "step": 5883 }, { "epoch": 2.447598991238333, "grad_norm": 1.8321913480758667, "learning_rate": 1.7209431432173252e-06, "loss": 0.7998, "step": 5884 }, { "epoch": 2.4480149754309335, "grad_norm": 2.0085854530334473, "learning_rate": 1.718431115248299e-06, "loss": 0.806, "step": 5885 }, { "epoch": 2.4484309596235345, "grad_norm": 1.8514502048492432, "learning_rate": 1.7159207496525209e-06, "loss": 0.849, "step": 5886 }, { "epoch": 2.448846943816135, "grad_norm": 1.9291861057281494, "learning_rate": 1.7134120469339022e-06, "loss": 0.8429, "step": 5887 }, { "epoch": 2.4492629280087357, "grad_norm": 2.623164415359497, "learning_rate": 1.710905007596022e-06, "loss": 0.8013, "step": 5888 }, { "epoch": 2.4496789122013363, "grad_norm": 1.9415605068206787, "learning_rate": 1.7083996321421215e-06, "loss": 0.8169, "step": 5889 }, { "epoch": 2.450094896393937, "grad_norm": 36.69732666015625, "learning_rate": 1.705895921075107e-06, "loss": 0.7564, "step": 5890 }, { "epoch": 2.450510880586538, "grad_norm": 1.703185796737671, "learning_rate": 1.7033938748975499e-06, "loss": 0.7734, "step": 5891 }, { "epoch": 2.4509268647791385, "grad_norm": 1.9953463077545166, "learning_rate": 1.7008934941116972e-06, "loss": 0.7668, "step": 5892 }, { "epoch": 2.451342848971739, "grad_norm": 1.8223304748535156, "learning_rate": 1.6983947792194477e-06, "loss": 0.7934, "step": 5893 }, { "epoch": 2.4517588331643396, "grad_norm": 1.95516836643219, "learning_rate": 1.6958977307223789e-06, "loss": 0.7985, "step": 5894 }, { "epoch": 2.4521748173569407, "grad_norm": 1.9775818586349487, "learning_rate": 1.693402349121721e-06, "loss": 0.8058, "step": 5895 }, { "epoch": 2.4525908015495412, "grad_norm": 1.9256188869476318, "learning_rate": 1.6909086349183823e-06, "loss": 0.7831, "step": 5896 }, { "epoch": 2.453006785742142, "grad_norm": 1.930951476097107, "learning_rate": 1.6884165886129256e-06, "loss": 0.8187, "step": 5897 }, { "epoch": 2.4534227699347424, "grad_norm": 2.011241912841797, "learning_rate": 1.6859262107055884e-06, "loss": 0.781, "step": 5898 }, { "epoch": 2.453838754127343, "grad_norm": 1.8069932460784912, "learning_rate": 1.6834375016962645e-06, "loss": 0.7932, "step": 5899 }, { "epoch": 2.454254738319944, "grad_norm": 1.8689264059066772, "learning_rate": 1.6809504620845173e-06, "loss": 0.8691, "step": 5900 }, { "epoch": 2.4546707225125446, "grad_norm": 1.864308476448059, "learning_rate": 1.6784650923695733e-06, "loss": 0.7785, "step": 5901 }, { "epoch": 2.455086706705145, "grad_norm": 2.033404588699341, "learning_rate": 1.6759813930503289e-06, "loss": 0.8549, "step": 5902 }, { "epoch": 2.4555026908977458, "grad_norm": 1.7941018342971802, "learning_rate": 1.6734993646253362e-06, "loss": 0.8422, "step": 5903 }, { "epoch": 2.455918675090347, "grad_norm": 1.939213752746582, "learning_rate": 1.671019007592819e-06, "loss": 0.7779, "step": 5904 }, { "epoch": 2.4563346592829474, "grad_norm": 1.8757344484329224, "learning_rate": 1.6685403224506668e-06, "loss": 0.8074, "step": 5905 }, { "epoch": 2.456750643475548, "grad_norm": 1.6874346733093262, "learning_rate": 1.6660633096964252e-06, "loss": 0.7163, "step": 5906 }, { "epoch": 2.4571666276681485, "grad_norm": 1.9419021606445312, "learning_rate": 1.663587969827314e-06, "loss": 0.7967, "step": 5907 }, { "epoch": 2.457582611860749, "grad_norm": 1.8850096464157104, "learning_rate": 1.66111430334021e-06, "loss": 0.9262, "step": 5908 }, { "epoch": 2.45799859605335, "grad_norm": 1.8528001308441162, "learning_rate": 1.6586423107316553e-06, "loss": 0.6922, "step": 5909 }, { "epoch": 2.4584145802459507, "grad_norm": 1.8612565994262695, "learning_rate": 1.6561719924978559e-06, "loss": 0.8686, "step": 5910 }, { "epoch": 2.4588305644385513, "grad_norm": 2.0236847400665283, "learning_rate": 1.6537033491346877e-06, "loss": 0.7892, "step": 5911 }, { "epoch": 2.459246548631152, "grad_norm": 2.1827943325042725, "learning_rate": 1.6512363811376809e-06, "loss": 0.7698, "step": 5912 }, { "epoch": 2.459662532823753, "grad_norm": 1.6327645778656006, "learning_rate": 1.648771089002038e-06, "loss": 0.6286, "step": 5913 }, { "epoch": 2.4600785170163535, "grad_norm": 1.9842716455459595, "learning_rate": 1.6463074732226182e-06, "loss": 0.8297, "step": 5914 }, { "epoch": 2.460494501208954, "grad_norm": 2.0589168071746826, "learning_rate": 1.6438455342939497e-06, "loss": 0.9087, "step": 5915 }, { "epoch": 2.4609104854015547, "grad_norm": 1.95145845413208, "learning_rate": 1.6413852727102187e-06, "loss": 0.8275, "step": 5916 }, { "epoch": 2.4613264695941552, "grad_norm": 1.8431843519210815, "learning_rate": 1.6389266889652823e-06, "loss": 0.7492, "step": 5917 }, { "epoch": 2.4617424537867563, "grad_norm": 1.7773643732070923, "learning_rate": 1.6364697835526523e-06, "loss": 0.7134, "step": 5918 }, { "epoch": 2.462158437979357, "grad_norm": 1.8515514135360718, "learning_rate": 1.6340145569655064e-06, "loss": 0.8081, "step": 5919 }, { "epoch": 2.4625744221719574, "grad_norm": 1.896725058555603, "learning_rate": 1.6315610096966904e-06, "loss": 0.7786, "step": 5920 }, { "epoch": 2.462990406364558, "grad_norm": 1.9368304014205933, "learning_rate": 1.629109142238705e-06, "loss": 0.8202, "step": 5921 }, { "epoch": 2.463406390557159, "grad_norm": 1.7264468669891357, "learning_rate": 1.626658955083722e-06, "loss": 0.7474, "step": 5922 }, { "epoch": 2.4638223747497596, "grad_norm": 1.9867507219314575, "learning_rate": 1.6242104487235665e-06, "loss": 0.7983, "step": 5923 }, { "epoch": 2.46423835894236, "grad_norm": 2.1136157512664795, "learning_rate": 1.621763623649737e-06, "loss": 0.8535, "step": 5924 }, { "epoch": 2.464654343134961, "grad_norm": 1.951823353767395, "learning_rate": 1.6193184803533824e-06, "loss": 0.8306, "step": 5925 }, { "epoch": 2.4650703273275614, "grad_norm": 3.524257183074951, "learning_rate": 1.6168750193253268e-06, "loss": 0.7543, "step": 5926 }, { "epoch": 2.4654863115201624, "grad_norm": 1.939868450164795, "learning_rate": 1.6144332410560447e-06, "loss": 0.7557, "step": 5927 }, { "epoch": 2.465902295712763, "grad_norm": 1.8511128425598145, "learning_rate": 1.6119931460356841e-06, "loss": 0.7233, "step": 5928 }, { "epoch": 2.4663182799053636, "grad_norm": 1.792847990989685, "learning_rate": 1.6095547347540452e-06, "loss": 0.9128, "step": 5929 }, { "epoch": 2.466734264097964, "grad_norm": 1.8011181354522705, "learning_rate": 1.6071180077005967e-06, "loss": 0.837, "step": 5930 }, { "epoch": 2.467150248290565, "grad_norm": 1.8703693151474, "learning_rate": 1.6046829653644624e-06, "loss": 0.7207, "step": 5931 }, { "epoch": 2.4675662324831658, "grad_norm": 10.551868438720703, "learning_rate": 1.6022496082344384e-06, "loss": 0.8437, "step": 5932 }, { "epoch": 2.4679822166757663, "grad_norm": 1.7787542343139648, "learning_rate": 1.5998179367989708e-06, "loss": 0.9543, "step": 5933 }, { "epoch": 2.468398200868367, "grad_norm": 2.0524230003356934, "learning_rate": 1.5973879515461765e-06, "loss": 0.8403, "step": 5934 }, { "epoch": 2.4688141850609675, "grad_norm": 1.9579474925994873, "learning_rate": 1.5949596529638334e-06, "loss": 0.8439, "step": 5935 }, { "epoch": 2.4692301692535685, "grad_norm": 1.8658406734466553, "learning_rate": 1.5925330415393714e-06, "loss": 0.7988, "step": 5936 }, { "epoch": 2.469646153446169, "grad_norm": 2.4969193935394287, "learning_rate": 1.5901081177598944e-06, "loss": 0.8506, "step": 5937 }, { "epoch": 2.4700621376387697, "grad_norm": 1.9308706521987915, "learning_rate": 1.5876848821121582e-06, "loss": 0.9106, "step": 5938 }, { "epoch": 2.4704781218313703, "grad_norm": 1.8755697011947632, "learning_rate": 1.5852633350825842e-06, "loss": 0.7691, "step": 5939 }, { "epoch": 2.4708941060239713, "grad_norm": 1.8525477647781372, "learning_rate": 1.58284347715725e-06, "loss": 0.7105, "step": 5940 }, { "epoch": 2.471310090216572, "grad_norm": 1.9433568716049194, "learning_rate": 1.5804253088219024e-06, "loss": 0.8053, "step": 5941 }, { "epoch": 2.4717260744091725, "grad_norm": 2.009983539581299, "learning_rate": 1.5780088305619412e-06, "loss": 0.8223, "step": 5942 }, { "epoch": 2.472142058601773, "grad_norm": 2.0281684398651123, "learning_rate": 1.5755940428624328e-06, "loss": 0.7344, "step": 5943 }, { "epoch": 2.4725580427943736, "grad_norm": 1.9958575963974, "learning_rate": 1.5731809462080982e-06, "loss": 0.852, "step": 5944 }, { "epoch": 2.4729740269869747, "grad_norm": 1.849241018295288, "learning_rate": 1.570769541083328e-06, "loss": 0.849, "step": 5945 }, { "epoch": 2.4733900111795752, "grad_norm": 46.00719451904297, "learning_rate": 1.5683598279721613e-06, "loss": 0.6602, "step": 5946 }, { "epoch": 2.473805995372176, "grad_norm": 1.8271880149841309, "learning_rate": 1.565951807358308e-06, "loss": 0.7684, "step": 5947 }, { "epoch": 2.4742219795647764, "grad_norm": 1.8795543909072876, "learning_rate": 1.5635454797251337e-06, "loss": 0.8556, "step": 5948 }, { "epoch": 2.4746379637573774, "grad_norm": 1.823711633682251, "learning_rate": 1.5611408455556608e-06, "loss": 0.7855, "step": 5949 }, { "epoch": 2.475053947949978, "grad_norm": 2.06453013420105, "learning_rate": 1.5587379053325813e-06, "loss": 1.0202, "step": 5950 }, { "epoch": 2.4754699321425786, "grad_norm": 2.0085737705230713, "learning_rate": 1.5563366595382356e-06, "loss": 0.8408, "step": 5951 }, { "epoch": 2.475885916335179, "grad_norm": 1.8905580043792725, "learning_rate": 1.5539371086546351e-06, "loss": 0.6968, "step": 5952 }, { "epoch": 2.4763019005277798, "grad_norm": 21.79802703857422, "learning_rate": 1.551539253163442e-06, "loss": 0.848, "step": 5953 }, { "epoch": 2.476717884720381, "grad_norm": 1.8518203496932983, "learning_rate": 1.549143093545985e-06, "loss": 0.7365, "step": 5954 }, { "epoch": 2.4771338689129814, "grad_norm": 28.295551300048828, "learning_rate": 1.5467486302832446e-06, "loss": 0.7635, "step": 5955 }, { "epoch": 2.477549853105582, "grad_norm": 1.8436691761016846, "learning_rate": 1.544355863855871e-06, "loss": 0.7726, "step": 5956 }, { "epoch": 2.4779658372981825, "grad_norm": 1.8697530031204224, "learning_rate": 1.5419647947441663e-06, "loss": 0.7966, "step": 5957 }, { "epoch": 2.4783818214907836, "grad_norm": 1.8640109300613403, "learning_rate": 1.5395754234280902e-06, "loss": 0.7743, "step": 5958 }, { "epoch": 2.478797805683384, "grad_norm": 1.7455848455429077, "learning_rate": 1.5371877503872701e-06, "loss": 0.8728, "step": 5959 }, { "epoch": 2.4792137898759847, "grad_norm": 1.8549988269805908, "learning_rate": 1.534801776100986e-06, "loss": 0.8044, "step": 5960 }, { "epoch": 2.4796297740685853, "grad_norm": 2.0726046562194824, "learning_rate": 1.5324175010481746e-06, "loss": 0.8116, "step": 5961 }, { "epoch": 2.480045758261186, "grad_norm": 1.8354144096374512, "learning_rate": 1.530034925707442e-06, "loss": 0.8641, "step": 5962 }, { "epoch": 2.480461742453787, "grad_norm": 1.9948244094848633, "learning_rate": 1.5276540505570425e-06, "loss": 0.8405, "step": 5963 }, { "epoch": 2.4808777266463875, "grad_norm": 1.895484209060669, "learning_rate": 1.5252748760748925e-06, "loss": 0.7945, "step": 5964 }, { "epoch": 2.481293710838988, "grad_norm": 1.719207763671875, "learning_rate": 1.5228974027385722e-06, "loss": 0.7841, "step": 5965 }, { "epoch": 2.4817096950315887, "grad_norm": 1.921858549118042, "learning_rate": 1.5205216310253114e-06, "loss": 0.8054, "step": 5966 }, { "epoch": 2.4821256792241897, "grad_norm": 1.9123584032058716, "learning_rate": 1.5181475614120057e-06, "loss": 0.8448, "step": 5967 }, { "epoch": 2.4825416634167903, "grad_norm": 1.7730045318603516, "learning_rate": 1.5157751943752053e-06, "loss": 0.7576, "step": 5968 }, { "epoch": 2.482957647609391, "grad_norm": 1.8186222314834595, "learning_rate": 1.5134045303911193e-06, "loss": 0.8135, "step": 5969 }, { "epoch": 2.4833736318019914, "grad_norm": 1.844147801399231, "learning_rate": 1.5110355699356116e-06, "loss": 0.6751, "step": 5970 }, { "epoch": 2.483789615994592, "grad_norm": 2.1982481479644775, "learning_rate": 1.5086683134842127e-06, "loss": 0.8207, "step": 5971 }, { "epoch": 2.484205600187193, "grad_norm": 2.021629810333252, "learning_rate": 1.5063027615121028e-06, "loss": 0.8201, "step": 5972 }, { "epoch": 2.4846215843797936, "grad_norm": 1.8399536609649658, "learning_rate": 1.5039389144941262e-06, "loss": 0.7295, "step": 5973 }, { "epoch": 2.485037568572394, "grad_norm": 1.9019595384597778, "learning_rate": 1.501576772904777e-06, "loss": 0.7653, "step": 5974 }, { "epoch": 2.485453552764995, "grad_norm": 1.977486491203308, "learning_rate": 1.4992163372182178e-06, "loss": 0.834, "step": 5975 }, { "epoch": 2.485869536957596, "grad_norm": 1.8312535285949707, "learning_rate": 1.4968576079082564e-06, "loss": 0.6648, "step": 5976 }, { "epoch": 2.4862855211501964, "grad_norm": 1.8896386623382568, "learning_rate": 1.4945005854483718e-06, "loss": 0.7494, "step": 5977 }, { "epoch": 2.486701505342797, "grad_norm": 6.941684246063232, "learning_rate": 1.492145270311688e-06, "loss": 0.8327, "step": 5978 }, { "epoch": 2.4871174895353976, "grad_norm": 2.121537685394287, "learning_rate": 1.4897916629709907e-06, "loss": 0.8428, "step": 5979 }, { "epoch": 2.487533473727998, "grad_norm": 1.9119688272476196, "learning_rate": 1.487439763898728e-06, "loss": 0.6373, "step": 5980 }, { "epoch": 2.487949457920599, "grad_norm": 1.9941112995147705, "learning_rate": 1.4850895735669958e-06, "loss": 0.7882, "step": 5981 }, { "epoch": 2.4883654421131998, "grad_norm": 1.9045805931091309, "learning_rate": 1.4827410924475548e-06, "loss": 0.8749, "step": 5982 }, { "epoch": 2.4887814263058003, "grad_norm": 1.8512438535690308, "learning_rate": 1.4803943210118177e-06, "loss": 0.782, "step": 5983 }, { "epoch": 2.489197410498401, "grad_norm": 1.7961496114730835, "learning_rate": 1.4780492597308583e-06, "loss": 0.7573, "step": 5984 }, { "epoch": 2.489613394691002, "grad_norm": 46.92148208618164, "learning_rate": 1.475705909075401e-06, "loss": 0.7001, "step": 5985 }, { "epoch": 2.4900293788836025, "grad_norm": 2.053276300430298, "learning_rate": 1.4733642695158357e-06, "loss": 0.7832, "step": 5986 }, { "epoch": 2.490445363076203, "grad_norm": 2.113065004348755, "learning_rate": 1.4710243415221993e-06, "loss": 0.8807, "step": 5987 }, { "epoch": 2.4908613472688037, "grad_norm": 1.913561463356018, "learning_rate": 1.4686861255641915e-06, "loss": 0.7673, "step": 5988 }, { "epoch": 2.4912773314614043, "grad_norm": 6.335549354553223, "learning_rate": 1.466349622111164e-06, "loss": 0.7927, "step": 5989 }, { "epoch": 2.4916933156540053, "grad_norm": 2.0657875537872314, "learning_rate": 1.46401483163213e-06, "loss": 0.8835, "step": 5990 }, { "epoch": 2.492109299846606, "grad_norm": 1.788919448852539, "learning_rate": 1.4616817545957528e-06, "loss": 0.7894, "step": 5991 }, { "epoch": 2.4925252840392065, "grad_norm": 1.9389317035675049, "learning_rate": 1.45935039147036e-06, "loss": 0.7142, "step": 5992 }, { "epoch": 2.492941268231807, "grad_norm": 1.811426043510437, "learning_rate": 1.4570207427239235e-06, "loss": 0.7709, "step": 5993 }, { "epoch": 2.493357252424408, "grad_norm": 1.8371673822402954, "learning_rate": 1.4546928088240809e-06, "loss": 0.7353, "step": 5994 }, { "epoch": 2.4937732366170087, "grad_norm": 1.9096496105194092, "learning_rate": 1.4523665902381245e-06, "loss": 0.7537, "step": 5995 }, { "epoch": 2.4941892208096093, "grad_norm": 5.946926116943359, "learning_rate": 1.450042087432998e-06, "loss": 0.8707, "step": 5996 }, { "epoch": 2.49460520500221, "grad_norm": 2.087149143218994, "learning_rate": 1.447719300875302e-06, "loss": 0.7657, "step": 5997 }, { "epoch": 2.4950211891948104, "grad_norm": 1.8551939725875854, "learning_rate": 1.445398231031293e-06, "loss": 0.8305, "step": 5998 }, { "epoch": 2.4954371733874114, "grad_norm": 1.9663478136062622, "learning_rate": 1.4430788783668847e-06, "loss": 0.8184, "step": 5999 }, { "epoch": 2.495853157580012, "grad_norm": 2.0358848571777344, "learning_rate": 1.4407612433476437e-06, "loss": 0.8766, "step": 6000 }, { "epoch": 2.495853157580012, "eval_loss": 0.7538391351699829, "eval_runtime": 1910.3309, "eval_samples_per_second": 3.45, "eval_steps_per_second": 1.725, "step": 6000 }, { "epoch": 2.4962691417726126, "grad_norm": 2.0300769805908203, "learning_rate": 1.4384453264387944e-06, "loss": 0.7936, "step": 6001 }, { "epoch": 2.496685125965213, "grad_norm": 2.0096681118011475, "learning_rate": 1.4361311281052114e-06, "loss": 0.7489, "step": 6002 }, { "epoch": 2.497101110157814, "grad_norm": 1.75189208984375, "learning_rate": 1.4338186488114326e-06, "loss": 0.7107, "step": 6003 }, { "epoch": 2.497517094350415, "grad_norm": 1.9313122034072876, "learning_rate": 1.4315078890216395e-06, "loss": 0.7647, "step": 6004 }, { "epoch": 2.4979330785430154, "grad_norm": 2.004495859146118, "learning_rate": 1.4291988491996823e-06, "loss": 0.7624, "step": 6005 }, { "epoch": 2.498349062735616, "grad_norm": 1.97075355052948, "learning_rate": 1.4268915298090502e-06, "loss": 0.801, "step": 6006 }, { "epoch": 2.4987650469282165, "grad_norm": 1.9349191188812256, "learning_rate": 1.4245859313129028e-06, "loss": 0.8234, "step": 6007 }, { "epoch": 2.4991810311208176, "grad_norm": 21.854036331176758, "learning_rate": 1.4222820541740446e-06, "loss": 0.862, "step": 6008 }, { "epoch": 2.499597015313418, "grad_norm": 2.0344767570495605, "learning_rate": 1.4199798988549318e-06, "loss": 0.8845, "step": 6009 }, { "epoch": 2.5000129995060187, "grad_norm": 1.9776781797409058, "learning_rate": 1.4176794658176862e-06, "loss": 0.7587, "step": 6010 }, { "epoch": 2.5004289836986193, "grad_norm": 1.8518636226654053, "learning_rate": 1.4153807555240739e-06, "loss": 0.8298, "step": 6011 }, { "epoch": 2.5008449678912203, "grad_norm": 1.955570936203003, "learning_rate": 1.4130837684355213e-06, "loss": 0.7224, "step": 6012 }, { "epoch": 2.501260952083821, "grad_norm": 1.993309497833252, "learning_rate": 1.4107885050131032e-06, "loss": 0.7552, "step": 6013 }, { "epoch": 2.5016769362764215, "grad_norm": 1.8720539808273315, "learning_rate": 1.4084949657175572e-06, "loss": 0.8046, "step": 6014 }, { "epoch": 2.502092920469022, "grad_norm": 1.9599347114562988, "learning_rate": 1.4062031510092622e-06, "loss": 0.7868, "step": 6015 }, { "epoch": 2.5025089046616227, "grad_norm": 1.9658520221710205, "learning_rate": 1.4039130613482655e-06, "loss": 0.7952, "step": 6016 }, { "epoch": 2.5029248888542233, "grad_norm": 1.7448917627334595, "learning_rate": 1.401624697194256e-06, "loss": 0.7729, "step": 6017 }, { "epoch": 2.5033408730468243, "grad_norm": 1.8679859638214111, "learning_rate": 1.3993380590065808e-06, "loss": 0.7396, "step": 6018 }, { "epoch": 2.503756857239425, "grad_norm": 1.7054389715194702, "learning_rate": 1.3970531472442406e-06, "loss": 0.7693, "step": 6019 }, { "epoch": 2.5041728414320255, "grad_norm": 1.801910638809204, "learning_rate": 1.3947699623658929e-06, "loss": 0.6821, "step": 6020 }, { "epoch": 2.5045888256246265, "grad_norm": 2.051417350769043, "learning_rate": 1.3924885048298397e-06, "loss": 0.8059, "step": 6021 }, { "epoch": 2.505004809817227, "grad_norm": 1.895462155342102, "learning_rate": 1.3902087750940484e-06, "loss": 0.804, "step": 6022 }, { "epoch": 2.5054207940098276, "grad_norm": 1.8880268335342407, "learning_rate": 1.3879307736161263e-06, "loss": 0.7077, "step": 6023 }, { "epoch": 2.5058367782024282, "grad_norm": 1.909010887145996, "learning_rate": 1.3856545008533461e-06, "loss": 0.6906, "step": 6024 }, { "epoch": 2.506252762395029, "grad_norm": 1.825696349143982, "learning_rate": 1.3833799572626238e-06, "loss": 0.8832, "step": 6025 }, { "epoch": 2.5066687465876294, "grad_norm": 1.8625868558883667, "learning_rate": 1.3811071433005364e-06, "loss": 0.8442, "step": 6026 }, { "epoch": 2.5070847307802304, "grad_norm": 1.8346734046936035, "learning_rate": 1.3788360594233075e-06, "loss": 0.813, "step": 6027 }, { "epoch": 2.507500714972831, "grad_norm": 2.015834093093872, "learning_rate": 1.3765667060868126e-06, "loss": 0.8994, "step": 6028 }, { "epoch": 2.5079166991654316, "grad_norm": 2.0588505268096924, "learning_rate": 1.3742990837465887e-06, "loss": 0.8972, "step": 6029 }, { "epoch": 2.5083326833580326, "grad_norm": 2.0460894107818604, "learning_rate": 1.3720331928578134e-06, "loss": 0.8089, "step": 6030 }, { "epoch": 2.508748667550633, "grad_norm": 10.785996437072754, "learning_rate": 1.369769033875329e-06, "loss": 0.7412, "step": 6031 }, { "epoch": 2.5091646517432338, "grad_norm": 25.985294342041016, "learning_rate": 1.3675066072536169e-06, "loss": 0.7754, "step": 6032 }, { "epoch": 2.5095806359358344, "grad_norm": 1.978331446647644, "learning_rate": 1.3652459134468254e-06, "loss": 0.7955, "step": 6033 }, { "epoch": 2.509996620128435, "grad_norm": 1.8205375671386719, "learning_rate": 1.3629869529087404e-06, "loss": 0.8761, "step": 6034 }, { "epoch": 2.5104126043210355, "grad_norm": 1.8538192510604858, "learning_rate": 1.3607297260928121e-06, "loss": 0.8775, "step": 6035 }, { "epoch": 2.5108285885136366, "grad_norm": 2.1269702911376953, "learning_rate": 1.358474233452136e-06, "loss": 0.7955, "step": 6036 }, { "epoch": 2.511244572706237, "grad_norm": 43.907249450683594, "learning_rate": 1.3562204754394581e-06, "loss": 0.7307, "step": 6037 }, { "epoch": 2.5116605568988377, "grad_norm": 1.764346718788147, "learning_rate": 1.3539684525071838e-06, "loss": 0.8154, "step": 6038 }, { "epoch": 2.5120765410914387, "grad_norm": 1.8959097862243652, "learning_rate": 1.351718165107362e-06, "loss": 0.8384, "step": 6039 }, { "epoch": 2.5124925252840393, "grad_norm": 1.9102507829666138, "learning_rate": 1.3494696136916964e-06, "loss": 0.7839, "step": 6040 }, { "epoch": 2.51290850947664, "grad_norm": 1.8298900127410889, "learning_rate": 1.3472227987115427e-06, "loss": 0.7459, "step": 6041 }, { "epoch": 2.5133244936692405, "grad_norm": 18.623674392700195, "learning_rate": 1.3449777206179115e-06, "loss": 0.7981, "step": 6042 }, { "epoch": 2.513740477861841, "grad_norm": 1.808014154434204, "learning_rate": 1.3427343798614568e-06, "loss": 0.7437, "step": 6043 }, { "epoch": 2.5141564620544417, "grad_norm": 216.12960815429688, "learning_rate": 1.3404927768924913e-06, "loss": 0.7629, "step": 6044 }, { "epoch": 2.5145724462470427, "grad_norm": 1.9887789487838745, "learning_rate": 1.3382529121609723e-06, "loss": 0.7298, "step": 6045 }, { "epoch": 2.5149884304396433, "grad_norm": 1.694122314453125, "learning_rate": 1.3360147861165152e-06, "loss": 0.8185, "step": 6046 }, { "epoch": 2.515404414632244, "grad_norm": 1.907802939414978, "learning_rate": 1.333778399208382e-06, "loss": 0.8616, "step": 6047 }, { "epoch": 2.515820398824845, "grad_norm": 1.801776647567749, "learning_rate": 1.3315437518854845e-06, "loss": 0.7662, "step": 6048 }, { "epoch": 2.5162363830174455, "grad_norm": 1.9337139129638672, "learning_rate": 1.3293108445963864e-06, "loss": 0.8016, "step": 6049 }, { "epoch": 2.516652367210046, "grad_norm": 1.998667597770691, "learning_rate": 1.3270796777893081e-06, "loss": 0.8373, "step": 6050 }, { "epoch": 2.5170683514026466, "grad_norm": 1.8953278064727783, "learning_rate": 1.3248502519121087e-06, "loss": 0.7437, "step": 6051 }, { "epoch": 2.517484335595247, "grad_norm": 1.9348222017288208, "learning_rate": 1.3226225674123105e-06, "loss": 0.8837, "step": 6052 }, { "epoch": 2.517900319787848, "grad_norm": 1.9105210304260254, "learning_rate": 1.320396624737076e-06, "loss": 0.7685, "step": 6053 }, { "epoch": 2.518316303980449, "grad_norm": 1.9928326606750488, "learning_rate": 1.3181724243332273e-06, "loss": 0.7969, "step": 6054 }, { "epoch": 2.5187322881730494, "grad_norm": 1.9098637104034424, "learning_rate": 1.3159499666472252e-06, "loss": 0.771, "step": 6055 }, { "epoch": 2.51914827236565, "grad_norm": 1.8937972784042358, "learning_rate": 1.3137292521251943e-06, "loss": 0.8935, "step": 6056 }, { "epoch": 2.519564256558251, "grad_norm": 1.9241143465042114, "learning_rate": 1.3115102812128987e-06, "loss": 0.8263, "step": 6057 }, { "epoch": 2.5199802407508516, "grad_norm": 1.925079107284546, "learning_rate": 1.3092930543557548e-06, "loss": 0.6969, "step": 6058 }, { "epoch": 2.520396224943452, "grad_norm": 1.8147188425064087, "learning_rate": 1.3070775719988338e-06, "loss": 0.7863, "step": 6059 }, { "epoch": 2.5208122091360528, "grad_norm": 2.0065839290618896, "learning_rate": 1.3048638345868504e-06, "loss": 0.8355, "step": 6060 }, { "epoch": 2.5212281933286533, "grad_norm": 1.9998606443405151, "learning_rate": 1.3026518425641744e-06, "loss": 0.8119, "step": 6061 }, { "epoch": 2.521644177521254, "grad_norm": 1.8194388151168823, "learning_rate": 1.3004415963748184e-06, "loss": 0.7506, "step": 6062 }, { "epoch": 2.522060161713855, "grad_norm": 1.9624232053756714, "learning_rate": 1.2982330964624545e-06, "loss": 0.7773, "step": 6063 }, { "epoch": 2.5224761459064555, "grad_norm": 4.4930100440979, "learning_rate": 1.2960263432703946e-06, "loss": 0.7933, "step": 6064 }, { "epoch": 2.522892130099056, "grad_norm": 1.9534577131271362, "learning_rate": 1.2938213372416064e-06, "loss": 0.7908, "step": 6065 }, { "epoch": 2.523308114291657, "grad_norm": 1.9029099941253662, "learning_rate": 1.2916180788187026e-06, "loss": 0.9226, "step": 6066 }, { "epoch": 2.5237240984842577, "grad_norm": 1.9403148889541626, "learning_rate": 1.2894165684439487e-06, "loss": 0.7597, "step": 6067 }, { "epoch": 2.5241400826768583, "grad_norm": 1.859431505203247, "learning_rate": 1.2872168065592528e-06, "loss": 0.7021, "step": 6068 }, { "epoch": 2.524556066869459, "grad_norm": 1.7005311250686646, "learning_rate": 1.2850187936061843e-06, "loss": 0.7062, "step": 6069 }, { "epoch": 2.5249720510620595, "grad_norm": 1.894760251045227, "learning_rate": 1.2828225300259468e-06, "loss": 0.8528, "step": 6070 }, { "epoch": 2.52538803525466, "grad_norm": 2.0144340991973877, "learning_rate": 1.2806280162594043e-06, "loss": 0.7864, "step": 6071 }, { "epoch": 2.525804019447261, "grad_norm": 1.9114317893981934, "learning_rate": 1.2784352527470656e-06, "loss": 0.7713, "step": 6072 }, { "epoch": 2.5262200036398617, "grad_norm": 1.8566858768463135, "learning_rate": 1.2762442399290842e-06, "loss": 0.7108, "step": 6073 }, { "epoch": 2.5266359878324622, "grad_norm": 1.8441675901412964, "learning_rate": 1.2740549782452715e-06, "loss": 0.7148, "step": 6074 }, { "epoch": 2.5270519720250633, "grad_norm": 1.9733723402023315, "learning_rate": 1.2718674681350774e-06, "loss": 0.8796, "step": 6075 }, { "epoch": 2.527467956217664, "grad_norm": 2.0139825344085693, "learning_rate": 1.2696817100376058e-06, "loss": 0.8135, "step": 6076 }, { "epoch": 2.5278839404102644, "grad_norm": 1.9802703857421875, "learning_rate": 1.2674977043916048e-06, "loss": 0.7069, "step": 6077 }, { "epoch": 2.528299924602865, "grad_norm": 3.1224207878112793, "learning_rate": 1.2653154516354782e-06, "loss": 0.8107, "step": 6078 }, { "epoch": 2.5287159087954656, "grad_norm": 1.9969098567962646, "learning_rate": 1.263134952207269e-06, "loss": 0.8571, "step": 6079 }, { "epoch": 2.529131892988066, "grad_norm": 2.058087110519409, "learning_rate": 1.2609562065446757e-06, "loss": 0.7555, "step": 6080 }, { "epoch": 2.529547877180667, "grad_norm": 1.8755077123641968, "learning_rate": 1.2587792150850387e-06, "loss": 0.7486, "step": 6081 }, { "epoch": 2.529963861373268, "grad_norm": 1.903601050376892, "learning_rate": 1.2566039782653528e-06, "loss": 0.8258, "step": 6082 }, { "epoch": 2.5303798455658684, "grad_norm": 2.0073041915893555, "learning_rate": 1.254430496522253e-06, "loss": 0.8086, "step": 6083 }, { "epoch": 2.5307958297584694, "grad_norm": 1.9053555727005005, "learning_rate": 1.2522587702920307e-06, "loss": 0.8496, "step": 6084 }, { "epoch": 2.53121181395107, "grad_norm": 2.0167293548583984, "learning_rate": 1.2500888000106147e-06, "loss": 0.8385, "step": 6085 }, { "epoch": 2.5316277981436706, "grad_norm": 1.920691967010498, "learning_rate": 1.2479205861135912e-06, "loss": 0.7618, "step": 6086 }, { "epoch": 2.532043782336271, "grad_norm": 1.8306752443313599, "learning_rate": 1.2457541290361886e-06, "loss": 0.768, "step": 6087 }, { "epoch": 2.5324597665288717, "grad_norm": 1.9551059007644653, "learning_rate": 1.24358942921328e-06, "loss": 0.8021, "step": 6088 }, { "epoch": 2.5328757507214723, "grad_norm": 1.7888106107711792, "learning_rate": 1.2414264870793936e-06, "loss": 0.8688, "step": 6089 }, { "epoch": 2.5332917349140733, "grad_norm": 1.9878182411193848, "learning_rate": 1.2392653030686962e-06, "loss": 0.9392, "step": 6090 }, { "epoch": 2.533707719106674, "grad_norm": 2.924283981323242, "learning_rate": 1.2371058776150114e-06, "loss": 0.8181, "step": 6091 }, { "epoch": 2.5341237032992745, "grad_norm": 1.9681164026260376, "learning_rate": 1.2349482111517986e-06, "loss": 0.7607, "step": 6092 }, { "epoch": 2.5345396874918755, "grad_norm": 1.8864964246749878, "learning_rate": 1.2327923041121748e-06, "loss": 0.8541, "step": 6093 }, { "epoch": 2.534955671684476, "grad_norm": 1.899088978767395, "learning_rate": 1.2306381569288939e-06, "loss": 0.7525, "step": 6094 }, { "epoch": 2.5353716558770767, "grad_norm": 2.061447858810425, "learning_rate": 1.2284857700343667e-06, "loss": 0.9323, "step": 6095 }, { "epoch": 2.5357876400696773, "grad_norm": 1.7577241659164429, "learning_rate": 1.226335143860642e-06, "loss": 0.777, "step": 6096 }, { "epoch": 2.536203624262278, "grad_norm": 2.160886287689209, "learning_rate": 1.2241862788394199e-06, "loss": 0.891, "step": 6097 }, { "epoch": 2.5366196084548784, "grad_norm": 1.8037678003311157, "learning_rate": 1.2220391754020434e-06, "loss": 0.8188, "step": 6098 }, { "epoch": 2.5370355926474795, "grad_norm": 1.8797612190246582, "learning_rate": 1.2198938339795073e-06, "loss": 0.8062, "step": 6099 }, { "epoch": 2.53745157684008, "grad_norm": 2.0172786712646484, "learning_rate": 1.2177502550024467e-06, "loss": 0.7452, "step": 6100 }, { "epoch": 2.5378675610326806, "grad_norm": 1.9038630723953247, "learning_rate": 1.2156084389011469e-06, "loss": 0.7801, "step": 6101 }, { "epoch": 2.5382835452252817, "grad_norm": 1.7937617301940918, "learning_rate": 1.2134683861055418e-06, "loss": 0.8108, "step": 6102 }, { "epoch": 2.5386995294178822, "grad_norm": 1.956684947013855, "learning_rate": 1.2113300970452012e-06, "loss": 0.8034, "step": 6103 }, { "epoch": 2.539115513610483, "grad_norm": 1.784388542175293, "learning_rate": 1.2091935721493541e-06, "loss": 0.8129, "step": 6104 }, { "epoch": 2.5395314978030834, "grad_norm": 1.8211455345153809, "learning_rate": 1.2070588118468663e-06, "loss": 0.7673, "step": 6105 }, { "epoch": 2.539947481995684, "grad_norm": 2.0464730262756348, "learning_rate": 1.204925816566249e-06, "loss": 0.7494, "step": 6106 }, { "epoch": 2.5403634661882846, "grad_norm": 1.8728224039077759, "learning_rate": 1.2027945867356638e-06, "loss": 0.7924, "step": 6107 }, { "epoch": 2.5407794503808856, "grad_norm": 1.8370201587677002, "learning_rate": 1.200665122782918e-06, "loss": 0.8125, "step": 6108 }, { "epoch": 2.541195434573486, "grad_norm": 1.8393415212631226, "learning_rate": 1.1985374251354587e-06, "loss": 0.6424, "step": 6109 }, { "epoch": 2.5416114187660868, "grad_norm": 1.840001106262207, "learning_rate": 1.1964114942203864e-06, "loss": 0.739, "step": 6110 }, { "epoch": 2.542027402958688, "grad_norm": 1.9397236108779907, "learning_rate": 1.1942873304644387e-06, "loss": 0.7369, "step": 6111 }, { "epoch": 2.5424433871512884, "grad_norm": 1.9208439588546753, "learning_rate": 1.1921649342940056e-06, "loss": 0.7003, "step": 6112 }, { "epoch": 2.542859371343889, "grad_norm": 2.4416747093200684, "learning_rate": 1.1900443061351175e-06, "loss": 0.7963, "step": 6113 }, { "epoch": 2.5432753555364895, "grad_norm": 2.037147045135498, "learning_rate": 1.1879254464134538e-06, "loss": 0.7796, "step": 6114 }, { "epoch": 2.54369133972909, "grad_norm": 6.340185165405273, "learning_rate": 1.1858083555543353e-06, "loss": 0.805, "step": 6115 }, { "epoch": 2.5441073239216907, "grad_norm": 1.9366203546524048, "learning_rate": 1.1836930339827279e-06, "loss": 0.8142, "step": 6116 }, { "epoch": 2.5445233081142917, "grad_norm": 1.848928689956665, "learning_rate": 1.1815794821232473e-06, "loss": 0.8505, "step": 6117 }, { "epoch": 2.5449392923068923, "grad_norm": 2.154536485671997, "learning_rate": 1.1794677004001454e-06, "loss": 0.9521, "step": 6118 }, { "epoch": 2.545355276499493, "grad_norm": 1.8444474935531616, "learning_rate": 1.1773576892373283e-06, "loss": 0.7602, "step": 6119 }, { "epoch": 2.545771260692094, "grad_norm": 1.7579290866851807, "learning_rate": 1.1752494490583378e-06, "loss": 0.7934, "step": 6120 }, { "epoch": 2.5461872448846945, "grad_norm": 1.9601240158081055, "learning_rate": 1.1731429802863692e-06, "loss": 0.8329, "step": 6121 }, { "epoch": 2.546603229077295, "grad_norm": 1.9152882099151611, "learning_rate": 1.1710382833442534e-06, "loss": 0.9067, "step": 6122 }, { "epoch": 2.5470192132698957, "grad_norm": 1.9436745643615723, "learning_rate": 1.1689353586544728e-06, "loss": 0.756, "step": 6123 }, { "epoch": 2.5474351974624962, "grad_norm": 1.8954846858978271, "learning_rate": 1.1668342066391492e-06, "loss": 0.806, "step": 6124 }, { "epoch": 2.547851181655097, "grad_norm": 1.969004511833191, "learning_rate": 1.164734827720051e-06, "loss": 0.7834, "step": 6125 }, { "epoch": 2.548267165847698, "grad_norm": 2.191436529159546, "learning_rate": 1.1626372223185911e-06, "loss": 0.8293, "step": 6126 }, { "epoch": 2.5486831500402984, "grad_norm": 1.9594650268554688, "learning_rate": 1.1605413908558238e-06, "loss": 0.8297, "step": 6127 }, { "epoch": 2.549099134232899, "grad_norm": 1.8739537000656128, "learning_rate": 1.1584473337524482e-06, "loss": 0.7407, "step": 6128 }, { "epoch": 2.5495151184255, "grad_norm": 1.856955885887146, "learning_rate": 1.1563550514288102e-06, "loss": 0.798, "step": 6129 }, { "epoch": 2.5499311026181006, "grad_norm": 1.9170515537261963, "learning_rate": 1.1542645443048938e-06, "loss": 0.7388, "step": 6130 }, { "epoch": 2.550347086810701, "grad_norm": 1.89152991771698, "learning_rate": 1.1521758128003325e-06, "loss": 0.8034, "step": 6131 }, { "epoch": 2.550763071003302, "grad_norm": 1.7772866487503052, "learning_rate": 1.1500888573344026e-06, "loss": 0.8363, "step": 6132 }, { "epoch": 2.5511790551959024, "grad_norm": 2.026644229888916, "learning_rate": 1.1480036783260184e-06, "loss": 0.8681, "step": 6133 }, { "epoch": 2.551595039388503, "grad_norm": 2.203859329223633, "learning_rate": 1.1459202761937437e-06, "loss": 0.8066, "step": 6134 }, { "epoch": 2.552011023581104, "grad_norm": 1.8494969606399536, "learning_rate": 1.1438386513557842e-06, "loss": 0.6685, "step": 6135 }, { "epoch": 2.5524270077737046, "grad_norm": 2.0562522411346436, "learning_rate": 1.1417588042299865e-06, "loss": 0.8902, "step": 6136 }, { "epoch": 2.552842991966305, "grad_norm": 1.819551706314087, "learning_rate": 1.139680735233839e-06, "loss": 0.7358, "step": 6137 }, { "epoch": 2.553258976158906, "grad_norm": 1.8060340881347656, "learning_rate": 1.1376044447844814e-06, "loss": 0.7795, "step": 6138 }, { "epoch": 2.5536749603515068, "grad_norm": 2.229288101196289, "learning_rate": 1.1355299332986859e-06, "loss": 0.7966, "step": 6139 }, { "epoch": 2.5540909445441073, "grad_norm": 1.9457136392593384, "learning_rate": 1.133457201192877e-06, "loss": 0.7739, "step": 6140 }, { "epoch": 2.554506928736708, "grad_norm": 3.336345672607422, "learning_rate": 1.131386248883114e-06, "loss": 0.6761, "step": 6141 }, { "epoch": 2.5549229129293085, "grad_norm": 1.8323084115982056, "learning_rate": 1.129317076785107e-06, "loss": 0.8399, "step": 6142 }, { "epoch": 2.555338897121909, "grad_norm": 1.8177874088287354, "learning_rate": 1.127249685314199e-06, "loss": 0.8508, "step": 6143 }, { "epoch": 2.55575488131451, "grad_norm": 3.3723626136779785, "learning_rate": 1.125184074885387e-06, "loss": 0.8465, "step": 6144 }, { "epoch": 2.5561708655071107, "grad_norm": 1.9039000272750854, "learning_rate": 1.1231202459133005e-06, "loss": 0.7462, "step": 6145 }, { "epoch": 2.5565868496997113, "grad_norm": 1.9394031763076782, "learning_rate": 1.1210581988122137e-06, "loss": 0.772, "step": 6146 }, { "epoch": 2.5570028338923123, "grad_norm": 2.0668740272521973, "learning_rate": 1.11899793399605e-06, "loss": 0.8606, "step": 6147 }, { "epoch": 2.557418818084913, "grad_norm": 1.9317421913146973, "learning_rate": 1.116939451878365e-06, "loss": 0.7395, "step": 6148 }, { "epoch": 2.5578348022775135, "grad_norm": 1.7111077308654785, "learning_rate": 1.114882752872365e-06, "loss": 0.5582, "step": 6149 }, { "epoch": 2.558250786470114, "grad_norm": 1.8354562520980835, "learning_rate": 1.1128278373908918e-06, "loss": 0.9116, "step": 6150 }, { "epoch": 2.5586667706627146, "grad_norm": 1.830639362335205, "learning_rate": 1.1107747058464346e-06, "loss": 0.7767, "step": 6151 }, { "epoch": 2.5590827548553152, "grad_norm": 1.807738184928894, "learning_rate": 1.1087233586511181e-06, "loss": 0.8413, "step": 6152 }, { "epoch": 2.5594987390479162, "grad_norm": 1.861356258392334, "learning_rate": 1.1066737962167184e-06, "loss": 0.8123, "step": 6153 }, { "epoch": 2.559914723240517, "grad_norm": 1.8856614828109741, "learning_rate": 1.104626018954643e-06, "loss": 0.9413, "step": 6154 }, { "epoch": 2.5603307074331174, "grad_norm": 1.960593581199646, "learning_rate": 1.102580027275948e-06, "loss": 0.8708, "step": 6155 }, { "epoch": 2.5607466916257184, "grad_norm": 1.8270268440246582, "learning_rate": 1.1005358215913253e-06, "loss": 0.9159, "step": 6156 }, { "epoch": 2.561162675818319, "grad_norm": 1.8612862825393677, "learning_rate": 1.098493402311116e-06, "loss": 0.8446, "step": 6157 }, { "epoch": 2.5615786600109196, "grad_norm": 2.0115482807159424, "learning_rate": 1.0964527698452954e-06, "loss": 0.871, "step": 6158 }, { "epoch": 2.56199464420352, "grad_norm": 1.8900060653686523, "learning_rate": 1.0944139246034857e-06, "loss": 0.7673, "step": 6159 }, { "epoch": 2.5624106283961208, "grad_norm": 1.961916446685791, "learning_rate": 1.0923768669949442e-06, "loss": 0.9103, "step": 6160 }, { "epoch": 2.5628266125887214, "grad_norm": 14.13252067565918, "learning_rate": 1.0903415974285768e-06, "loss": 0.7303, "step": 6161 }, { "epoch": 2.5632425967813224, "grad_norm": 2.0570693016052246, "learning_rate": 1.0883081163129239e-06, "loss": 0.7785, "step": 6162 }, { "epoch": 2.563658580973923, "grad_norm": 1.9567428827285767, "learning_rate": 1.0862764240561696e-06, "loss": 0.8094, "step": 6163 }, { "epoch": 2.5640745651665235, "grad_norm": 1.829172134399414, "learning_rate": 1.0842465210661423e-06, "loss": 0.8015, "step": 6164 }, { "epoch": 2.5644905493591246, "grad_norm": 1.8739553689956665, "learning_rate": 1.0822184077503051e-06, "loss": 0.6937, "step": 6165 }, { "epoch": 2.564906533551725, "grad_norm": 34.18769454956055, "learning_rate": 1.0801920845157642e-06, "loss": 0.8658, "step": 6166 }, { "epoch": 2.5653225177443257, "grad_norm": 2.0917863845825195, "learning_rate": 1.0781675517692658e-06, "loss": 0.7939, "step": 6167 }, { "epoch": 2.5657385019369263, "grad_norm": 3.746821641921997, "learning_rate": 1.0761448099172022e-06, "loss": 0.8212, "step": 6168 }, { "epoch": 2.566154486129527, "grad_norm": 2.239213705062866, "learning_rate": 1.0741238593655966e-06, "loss": 0.7048, "step": 6169 }, { "epoch": 2.5665704703221275, "grad_norm": 1.7470910549163818, "learning_rate": 1.0721047005201225e-06, "loss": 0.6723, "step": 6170 }, { "epoch": 2.5669864545147285, "grad_norm": 2.083158254623413, "learning_rate": 1.0700873337860839e-06, "loss": 0.6832, "step": 6171 }, { "epoch": 2.567402438707329, "grad_norm": 2.0109589099884033, "learning_rate": 1.0680717595684353e-06, "loss": 0.9053, "step": 6172 }, { "epoch": 2.5678184228999297, "grad_norm": 1.9500856399536133, "learning_rate": 1.0660579782717628e-06, "loss": 0.7716, "step": 6173 }, { "epoch": 2.5682344070925307, "grad_norm": 1.9319498538970947, "learning_rate": 1.0640459903002987e-06, "loss": 0.8778, "step": 6174 }, { "epoch": 2.5686503912851313, "grad_norm": 1.973979115486145, "learning_rate": 1.062035796057912e-06, "loss": 0.8383, "step": 6175 }, { "epoch": 2.569066375477732, "grad_norm": 4.885695934295654, "learning_rate": 1.060027395948111e-06, "loss": 0.7846, "step": 6176 }, { "epoch": 2.5694823596703324, "grad_norm": 1.9895298480987549, "learning_rate": 1.0580207903740448e-06, "loss": 0.8471, "step": 6177 }, { "epoch": 2.569898343862933, "grad_norm": 1.9431301355361938, "learning_rate": 1.0560159797385039e-06, "loss": 0.8955, "step": 6178 }, { "epoch": 2.5703143280555336, "grad_norm": 1.8956605195999146, "learning_rate": 1.054012964443919e-06, "loss": 0.6691, "step": 6179 }, { "epoch": 2.5707303122481346, "grad_norm": 1.9147932529449463, "learning_rate": 1.052011744892355e-06, "loss": 0.7493, "step": 6180 }, { "epoch": 2.5711462964407352, "grad_norm": 1.9265235662460327, "learning_rate": 1.050012321485525e-06, "loss": 0.7628, "step": 6181 }, { "epoch": 2.571562280633336, "grad_norm": 1.9806874990463257, "learning_rate": 1.0480146946247706e-06, "loss": 0.8635, "step": 6182 }, { "epoch": 2.571978264825937, "grad_norm": 36.39236831665039, "learning_rate": 1.0460188647110846e-06, "loss": 0.8136, "step": 6183 }, { "epoch": 2.5723942490185374, "grad_norm": 1.9411072731018066, "learning_rate": 1.0440248321450908e-06, "loss": 0.7288, "step": 6184 }, { "epoch": 2.572810233211138, "grad_norm": 1.8678703308105469, "learning_rate": 1.042032597327054e-06, "loss": 0.8628, "step": 6185 }, { "epoch": 2.5732262174037386, "grad_norm": 4.043085098266602, "learning_rate": 1.040042160656879e-06, "loss": 0.79, "step": 6186 }, { "epoch": 2.573642201596339, "grad_norm": 1.9728857278823853, "learning_rate": 1.0380535225341104e-06, "loss": 0.8593, "step": 6187 }, { "epoch": 2.5740581857889397, "grad_norm": 1.907699465751648, "learning_rate": 1.036066683357928e-06, "loss": 0.8388, "step": 6188 }, { "epoch": 2.5744741699815408, "grad_norm": 1.9123871326446533, "learning_rate": 1.0340816435271594e-06, "loss": 0.8291, "step": 6189 }, { "epoch": 2.5748901541741414, "grad_norm": 1.728877067565918, "learning_rate": 1.0320984034402581e-06, "loss": 0.7849, "step": 6190 }, { "epoch": 2.575306138366742, "grad_norm": 1.9249070882797241, "learning_rate": 1.0301169634953289e-06, "loss": 0.8015, "step": 6191 }, { "epoch": 2.575722122559343, "grad_norm": 1.810433268547058, "learning_rate": 1.0281373240901049e-06, "loss": 0.7454, "step": 6192 }, { "epoch": 2.5761381067519435, "grad_norm": 1.940611720085144, "learning_rate": 1.026159485621967e-06, "loss": 0.7926, "step": 6193 }, { "epoch": 2.576554090944544, "grad_norm": 1.9069896936416626, "learning_rate": 1.0241834484879276e-06, "loss": 0.7798, "step": 6194 }, { "epoch": 2.5769700751371447, "grad_norm": 1.8799222707748413, "learning_rate": 1.0222092130846383e-06, "loss": 0.7082, "step": 6195 }, { "epoch": 2.5773860593297453, "grad_norm": 2.067840099334717, "learning_rate": 1.0202367798083946e-06, "loss": 0.775, "step": 6196 }, { "epoch": 2.577802043522346, "grad_norm": 1.8417654037475586, "learning_rate": 1.0182661490551215e-06, "loss": 0.692, "step": 6197 }, { "epoch": 2.578218027714947, "grad_norm": 1.7423847913742065, "learning_rate": 1.0162973212203931e-06, "loss": 0.6692, "step": 6198 }, { "epoch": 2.5786340119075475, "grad_norm": 1.7823938131332397, "learning_rate": 1.0143302966994084e-06, "loss": 0.765, "step": 6199 }, { "epoch": 2.579049996100148, "grad_norm": 3.3673126697540283, "learning_rate": 1.0123650758870185e-06, "loss": 0.797, "step": 6200 }, { "epoch": 2.579465980292749, "grad_norm": 2.027200698852539, "learning_rate": 1.0104016591777e-06, "loss": 0.7312, "step": 6201 }, { "epoch": 2.5798819644853497, "grad_norm": 2.0005791187286377, "learning_rate": 1.0084400469655775e-06, "loss": 0.8661, "step": 6202 }, { "epoch": 2.5802979486779503, "grad_norm": 1.9521088600158691, "learning_rate": 1.006480239644404e-06, "loss": 0.9016, "step": 6203 }, { "epoch": 2.580713932870551, "grad_norm": 1.7924083471298218, "learning_rate": 1.0045222376075792e-06, "loss": 0.7052, "step": 6204 }, { "epoch": 2.5811299170631514, "grad_norm": 2.0463578701019287, "learning_rate": 1.0025660412481342e-06, "loss": 0.8074, "step": 6205 }, { "epoch": 2.581545901255752, "grad_norm": 1.8904588222503662, "learning_rate": 1.0006116509587404e-06, "loss": 0.7842, "step": 6206 }, { "epoch": 2.581961885448353, "grad_norm": 1.8235810995101929, "learning_rate": 9.986590671317021e-07, "loss": 0.8803, "step": 6207 }, { "epoch": 2.5823778696409536, "grad_norm": 2.0256195068359375, "learning_rate": 9.967082901589675e-07, "loss": 0.8953, "step": 6208 }, { "epoch": 2.582793853833554, "grad_norm": 1.9158756732940674, "learning_rate": 9.947593204321226e-07, "loss": 0.7872, "step": 6209 }, { "epoch": 2.5832098380261552, "grad_norm": 1.9876827001571655, "learning_rate": 9.92812158342381e-07, "loss": 0.796, "step": 6210 }, { "epoch": 2.583625822218756, "grad_norm": 1.9014052152633667, "learning_rate": 9.90866804280607e-07, "loss": 0.8585, "step": 6211 }, { "epoch": 2.5840418064113564, "grad_norm": 1.8707300424575806, "learning_rate": 9.889232586372877e-07, "loss": 0.8371, "step": 6212 }, { "epoch": 2.584457790603957, "grad_norm": 1.894034504890442, "learning_rate": 9.86981521802559e-07, "loss": 0.8129, "step": 6213 }, { "epoch": 2.5848737747965576, "grad_norm": 1.8903992176055908, "learning_rate": 9.850415941661883e-07, "loss": 0.8426, "step": 6214 }, { "epoch": 2.585289758989158, "grad_norm": 1.7902374267578125, "learning_rate": 9.831034761175795e-07, "loss": 0.7643, "step": 6215 }, { "epoch": 2.585705743181759, "grad_norm": 1.9225993156433105, "learning_rate": 9.811671680457723e-07, "loss": 0.7898, "step": 6216 }, { "epoch": 2.5861217273743597, "grad_norm": 76.22055053710938, "learning_rate": 9.792326703394495e-07, "loss": 0.8135, "step": 6217 }, { "epoch": 2.5865377115669603, "grad_norm": 3.6311042308807373, "learning_rate": 9.772999833869223e-07, "loss": 0.7768, "step": 6218 }, { "epoch": 2.5869536957595614, "grad_norm": 1.9425817728042603, "learning_rate": 9.753691075761451e-07, "loss": 0.8054, "step": 6219 }, { "epoch": 2.587369679952162, "grad_norm": 1.9222242832183838, "learning_rate": 9.734400432947033e-07, "loss": 0.738, "step": 6220 }, { "epoch": 2.5877856641447625, "grad_norm": 1.8755673170089722, "learning_rate": 9.715127909298238e-07, "loss": 0.8547, "step": 6221 }, { "epoch": 2.588201648337363, "grad_norm": 1.7262096405029297, "learning_rate": 9.69587350868364e-07, "loss": 0.693, "step": 6222 }, { "epoch": 2.5886176325299637, "grad_norm": 1.6313879489898682, "learning_rate": 9.676637234968245e-07, "loss": 0.7068, "step": 6223 }, { "epoch": 2.5890336167225643, "grad_norm": 1.7478286027908325, "learning_rate": 9.657419092013366e-07, "loss": 0.7353, "step": 6224 }, { "epoch": 2.5894496009151653, "grad_norm": 1.9849282503128052, "learning_rate": 9.63821908367667e-07, "loss": 0.7819, "step": 6225 }, { "epoch": 2.589865585107766, "grad_norm": 1.9725838899612427, "learning_rate": 9.619037213812244e-07, "loss": 0.7563, "step": 6226 }, { "epoch": 2.5902815693003665, "grad_norm": 1.7694125175476074, "learning_rate": 9.599873486270472e-07, "loss": 0.764, "step": 6227 }, { "epoch": 2.5906975534929675, "grad_norm": 1.8023236989974976, "learning_rate": 9.580727904898135e-07, "loss": 0.6986, "step": 6228 }, { "epoch": 2.591113537685568, "grad_norm": 1.9063398838043213, "learning_rate": 9.561600473538345e-07, "loss": 0.8082, "step": 6229 }, { "epoch": 2.5915295218781687, "grad_norm": 2.030937433242798, "learning_rate": 9.5424911960306e-07, "loss": 0.914, "step": 6230 }, { "epoch": 2.5919455060707692, "grad_norm": 1.9061493873596191, "learning_rate": 9.52340007621072e-07, "loss": 0.7746, "step": 6231 }, { "epoch": 2.59236149026337, "grad_norm": 1.8123445510864258, "learning_rate": 9.504327117910916e-07, "loss": 0.6767, "step": 6232 }, { "epoch": 2.5927774744559704, "grad_norm": 1.95522141456604, "learning_rate": 9.485272324959726e-07, "loss": 0.8431, "step": 6233 }, { "epoch": 2.5931934586485714, "grad_norm": 1.8641294240951538, "learning_rate": 9.466235701182058e-07, "loss": 0.7565, "step": 6234 }, { "epoch": 2.593609442841172, "grad_norm": 1.7726755142211914, "learning_rate": 9.447217250399132e-07, "loss": 0.8084, "step": 6235 }, { "epoch": 2.5940254270337726, "grad_norm": 1.877063512802124, "learning_rate": 9.428216976428595e-07, "loss": 0.8354, "step": 6236 }, { "epoch": 2.5944414112263736, "grad_norm": 1.8544580936431885, "learning_rate": 9.409234883084372e-07, "loss": 0.7029, "step": 6237 }, { "epoch": 2.594857395418974, "grad_norm": 1.9354000091552734, "learning_rate": 9.390270974176785e-07, "loss": 0.846, "step": 6238 }, { "epoch": 2.595273379611575, "grad_norm": 1.9106214046478271, "learning_rate": 9.37132525351252e-07, "loss": 0.7296, "step": 6239 }, { "epoch": 2.5956893638041754, "grad_norm": 1.9630612134933472, "learning_rate": 9.352397724894524e-07, "loss": 0.8243, "step": 6240 }, { "epoch": 2.596105347996776, "grad_norm": 1.9442342519760132, "learning_rate": 9.333488392122214e-07, "loss": 0.7959, "step": 6241 }, { "epoch": 2.5965213321893765, "grad_norm": 2.0598151683807373, "learning_rate": 9.314597258991232e-07, "loss": 0.8425, "step": 6242 }, { "epoch": 2.5969373163819776, "grad_norm": 1.9138761758804321, "learning_rate": 9.295724329293687e-07, "loss": 0.7881, "step": 6243 }, { "epoch": 2.597353300574578, "grad_norm": 1.821704626083374, "learning_rate": 9.27686960681794e-07, "loss": 0.883, "step": 6244 }, { "epoch": 2.5977692847671787, "grad_norm": 15.340778350830078, "learning_rate": 9.258033095348729e-07, "loss": 0.7505, "step": 6245 }, { "epoch": 2.5981852689597797, "grad_norm": 1.6798988580703735, "learning_rate": 9.239214798667129e-07, "loss": 0.6817, "step": 6246 }, { "epoch": 2.5986012531523803, "grad_norm": 1.8755313158035278, "learning_rate": 9.220414720550597e-07, "loss": 0.7182, "step": 6247 }, { "epoch": 2.599017237344981, "grad_norm": 1.9005182981491089, "learning_rate": 9.20163286477288e-07, "loss": 0.849, "step": 6248 }, { "epoch": 2.5994332215375815, "grad_norm": 66.98941802978516, "learning_rate": 9.18286923510412e-07, "loss": 0.682, "step": 6249 }, { "epoch": 2.599849205730182, "grad_norm": 1.8979448080062866, "learning_rate": 9.164123835310734e-07, "loss": 0.7434, "step": 6250 }, { "epoch": 2.6002651899227827, "grad_norm": 71.09986114501953, "learning_rate": 9.14539666915556e-07, "loss": 0.6944, "step": 6251 }, { "epoch": 2.6006811741153837, "grad_norm": 2.026394844055176, "learning_rate": 9.126687740397688e-07, "loss": 0.8492, "step": 6252 }, { "epoch": 2.6010971583079843, "grad_norm": 1.7395511865615845, "learning_rate": 9.107997052792638e-07, "loss": 0.8523, "step": 6253 }, { "epoch": 2.601513142500585, "grad_norm": 1.8776451349258423, "learning_rate": 9.089324610092199e-07, "loss": 0.7808, "step": 6254 }, { "epoch": 2.601929126693186, "grad_norm": 1.8840359449386597, "learning_rate": 9.070670416044513e-07, "loss": 0.8373, "step": 6255 }, { "epoch": 2.6023451108857865, "grad_norm": 2.122730255126953, "learning_rate": 9.052034474394089e-07, "loss": 0.7745, "step": 6256 }, { "epoch": 2.602761095078387, "grad_norm": 1.737426519393921, "learning_rate": 9.033416788881732e-07, "loss": 0.7772, "step": 6257 }, { "epoch": 2.6031770792709876, "grad_norm": 1.9354064464569092, "learning_rate": 9.014817363244621e-07, "loss": 0.7946, "step": 6258 }, { "epoch": 2.603593063463588, "grad_norm": 2.0036067962646484, "learning_rate": 8.996236201216225e-07, "loss": 0.7853, "step": 6259 }, { "epoch": 2.604009047656189, "grad_norm": 2.055262327194214, "learning_rate": 8.977673306526402e-07, "loss": 0.8032, "step": 6260 }, { "epoch": 2.60442503184879, "grad_norm": 1.959583044052124, "learning_rate": 8.959128682901275e-07, "loss": 0.7971, "step": 6261 }, { "epoch": 2.6048410160413904, "grad_norm": 1.831544280052185, "learning_rate": 8.940602334063386e-07, "loss": 0.7155, "step": 6262 }, { "epoch": 2.605257000233991, "grad_norm": 1.7860941886901855, "learning_rate": 8.922094263731528e-07, "loss": 0.7824, "step": 6263 }, { "epoch": 2.605672984426592, "grad_norm": 1.9641551971435547, "learning_rate": 8.903604475620864e-07, "loss": 0.8257, "step": 6264 }, { "epoch": 2.6060889686191926, "grad_norm": 2.0080573558807373, "learning_rate": 8.885132973442867e-07, "loss": 0.7924, "step": 6265 }, { "epoch": 2.606504952811793, "grad_norm": 1.8462835550308228, "learning_rate": 8.866679760905372e-07, "loss": 0.8009, "step": 6266 }, { "epoch": 2.6069209370043938, "grad_norm": 1.9346318244934082, "learning_rate": 8.848244841712505e-07, "loss": 0.7721, "step": 6267 }, { "epoch": 2.6073369211969943, "grad_norm": 1.7945071458816528, "learning_rate": 8.829828219564751e-07, "loss": 0.7173, "step": 6268 }, { "epoch": 2.607752905389595, "grad_norm": 24.507369995117188, "learning_rate": 8.81142989815893e-07, "loss": 0.7935, "step": 6269 }, { "epoch": 2.608168889582196, "grad_norm": 1.9756617546081543, "learning_rate": 8.79304988118812e-07, "loss": 0.8406, "step": 6270 }, { "epoch": 2.6085848737747965, "grad_norm": 1.993420958518982, "learning_rate": 8.774688172341827e-07, "loss": 0.802, "step": 6271 }, { "epoch": 2.609000857967397, "grad_norm": 1.9220417737960815, "learning_rate": 8.756344775305792e-07, "loss": 0.7581, "step": 6272 }, { "epoch": 2.609416842159998, "grad_norm": 2.1329867839813232, "learning_rate": 8.738019693762123e-07, "loss": 0.7843, "step": 6273 }, { "epoch": 2.6098328263525987, "grad_norm": 2.0478553771972656, "learning_rate": 8.719712931389235e-07, "loss": 0.7762, "step": 6274 }, { "epoch": 2.6102488105451993, "grad_norm": 2.1337876319885254, "learning_rate": 8.701424491861887e-07, "loss": 0.8455, "step": 6275 }, { "epoch": 2.6106647947378, "grad_norm": 1.9115309715270996, "learning_rate": 8.683154378851144e-07, "loss": 0.8516, "step": 6276 }, { "epoch": 2.6110807789304005, "grad_norm": 1.8007296323776245, "learning_rate": 8.664902596024416e-07, "loss": 0.7428, "step": 6277 }, { "epoch": 2.611496763123001, "grad_norm": 208.94921875, "learning_rate": 8.646669147045372e-07, "loss": 0.7843, "step": 6278 }, { "epoch": 2.611912747315602, "grad_norm": 1.8393337726593018, "learning_rate": 8.628454035574085e-07, "loss": 0.8484, "step": 6279 }, { "epoch": 2.6123287315082027, "grad_norm": 1.8464826345443726, "learning_rate": 8.610257265266875e-07, "loss": 0.7224, "step": 6280 }, { "epoch": 2.6127447157008032, "grad_norm": 2.1162898540496826, "learning_rate": 8.592078839776441e-07, "loss": 0.85, "step": 6281 }, { "epoch": 2.6131606998934043, "grad_norm": 2.137375593185425, "learning_rate": 8.573918762751732e-07, "loss": 0.7668, "step": 6282 }, { "epoch": 2.613576684086005, "grad_norm": 1.8353873491287231, "learning_rate": 8.555777037838086e-07, "loss": 0.7668, "step": 6283 }, { "epoch": 2.6139926682786054, "grad_norm": 1.7992725372314453, "learning_rate": 8.537653668677104e-07, "loss": 0.8235, "step": 6284 }, { "epoch": 2.614408652471206, "grad_norm": 1.6808654069900513, "learning_rate": 8.519548658906718e-07, "loss": 0.8374, "step": 6285 }, { "epoch": 2.6148246366638066, "grad_norm": 3.811002254486084, "learning_rate": 8.501462012161188e-07, "loss": 0.623, "step": 6286 }, { "epoch": 2.615240620856407, "grad_norm": 2.084892511367798, "learning_rate": 8.483393732071055e-07, "loss": 0.8934, "step": 6287 }, { "epoch": 2.615656605049008, "grad_norm": 2065.45703125, "learning_rate": 8.465343822263228e-07, "loss": 0.706, "step": 6288 }, { "epoch": 2.616072589241609, "grad_norm": 5.662165641784668, "learning_rate": 8.447312286360853e-07, "loss": 0.7023, "step": 6289 }, { "epoch": 2.6164885734342094, "grad_norm": 1.8099925518035889, "learning_rate": 8.429299127983481e-07, "loss": 0.7376, "step": 6290 }, { "epoch": 2.6169045576268104, "grad_norm": 1.9903643131256104, "learning_rate": 8.411304350746885e-07, "loss": 0.8208, "step": 6291 }, { "epoch": 2.617320541819411, "grad_norm": 1.8983092308044434, "learning_rate": 8.393327958263209e-07, "loss": 0.8022, "step": 6292 }, { "epoch": 2.6177365260120116, "grad_norm": 1.9994182586669922, "learning_rate": 8.375369954140877e-07, "loss": 0.7613, "step": 6293 }, { "epoch": 2.618152510204612, "grad_norm": 1.9549551010131836, "learning_rate": 8.35743034198464e-07, "loss": 0.6814, "step": 6294 }, { "epoch": 2.6185684943972127, "grad_norm": 1.9528535604476929, "learning_rate": 8.339509125395507e-07, "loss": 0.7688, "step": 6295 }, { "epoch": 2.6189844785898133, "grad_norm": 1.9571977853775024, "learning_rate": 8.321606307970875e-07, "loss": 0.8767, "step": 6296 }, { "epoch": 2.6194004627824143, "grad_norm": 1.947855830192566, "learning_rate": 8.303721893304384e-07, "loss": 0.8013, "step": 6297 }, { "epoch": 2.619816446975015, "grad_norm": 1.8949849605560303, "learning_rate": 8.285855884986027e-07, "loss": 0.8461, "step": 6298 }, { "epoch": 2.6202324311676155, "grad_norm": 143729.328125, "learning_rate": 8.268008286602047e-07, "loss": 0.7363, "step": 6299 }, { "epoch": 2.6206484153602165, "grad_norm": 4.705626010894775, "learning_rate": 8.250179101735034e-07, "loss": 0.7982, "step": 6300 }, { "epoch": 2.621064399552817, "grad_norm": 1.884842872619629, "learning_rate": 8.232368333963892e-07, "loss": 0.6916, "step": 6301 }, { "epoch": 2.6214803837454177, "grad_norm": 2.001646041870117, "learning_rate": 8.214575986863793e-07, "loss": 0.8783, "step": 6302 }, { "epoch": 2.6218963679380183, "grad_norm": 1.7551521062850952, "learning_rate": 8.196802064006215e-07, "loss": 0.774, "step": 6303 }, { "epoch": 2.622312352130619, "grad_norm": 1.929038405418396, "learning_rate": 8.179046568958948e-07, "loss": 0.8367, "step": 6304 }, { "epoch": 2.6227283363232194, "grad_norm": 1.9193732738494873, "learning_rate": 8.161309505286097e-07, "loss": 0.7966, "step": 6305 }, { "epoch": 2.6231443205158205, "grad_norm": 6.672156810760498, "learning_rate": 8.143590876548024e-07, "loss": 0.8131, "step": 6306 }, { "epoch": 2.623560304708421, "grad_norm": 1.9144134521484375, "learning_rate": 8.125890686301452e-07, "loss": 0.7254, "step": 6307 }, { "epoch": 2.6239762889010216, "grad_norm": 1.9105955362319946, "learning_rate": 8.108208938099349e-07, "loss": 0.8066, "step": 6308 }, { "epoch": 2.6243922730936227, "grad_norm": 1.7594094276428223, "learning_rate": 8.090545635491021e-07, "loss": 0.7503, "step": 6309 }, { "epoch": 2.6248082572862232, "grad_norm": 1.803465723991394, "learning_rate": 8.07290078202202e-07, "loss": 0.6644, "step": 6310 }, { "epoch": 2.625224241478824, "grad_norm": 291.9000244140625, "learning_rate": 8.055274381234268e-07, "loss": 0.7393, "step": 6311 }, { "epoch": 2.6256402256714244, "grad_norm": 1.8618253469467163, "learning_rate": 8.037666436665925e-07, "loss": 0.8252, "step": 6312 }, { "epoch": 2.626056209864025, "grad_norm": 4.790866374969482, "learning_rate": 8.020076951851463e-07, "loss": 0.8094, "step": 6313 }, { "epoch": 2.6264721940566256, "grad_norm": 1.835374116897583, "learning_rate": 8.002505930321613e-07, "loss": 0.7248, "step": 6314 }, { "epoch": 2.6268881782492266, "grad_norm": 1.9771945476531982, "learning_rate": 7.984953375603488e-07, "loss": 0.9501, "step": 6315 }, { "epoch": 2.627304162441827, "grad_norm": 2.4038383960723877, "learning_rate": 7.967419291220424e-07, "loss": 0.6741, "step": 6316 }, { "epoch": 2.6277201466344278, "grad_norm": 2.0322537422180176, "learning_rate": 7.949903680692062e-07, "loss": 0.7951, "step": 6317 }, { "epoch": 2.628136130827029, "grad_norm": 2.017296314239502, "learning_rate": 7.932406547534355e-07, "loss": 0.8884, "step": 6318 }, { "epoch": 2.6285521150196294, "grad_norm": 1.7963522672653198, "learning_rate": 7.914927895259506e-07, "loss": 0.7352, "step": 6319 }, { "epoch": 2.62896809921223, "grad_norm": 2.071810007095337, "learning_rate": 7.897467727376062e-07, "loss": 0.853, "step": 6320 }, { "epoch": 2.6293840834048305, "grad_norm": 1.853104829788208, "learning_rate": 7.8800260473888e-07, "loss": 0.6819, "step": 6321 }, { "epoch": 2.629800067597431, "grad_norm": 1.9259151220321655, "learning_rate": 7.862602858798862e-07, "loss": 0.7286, "step": 6322 }, { "epoch": 2.6302160517900317, "grad_norm": 12.388479232788086, "learning_rate": 7.845198165103607e-07, "loss": 0.8154, "step": 6323 }, { "epoch": 2.6306320359826327, "grad_norm": 1.921330213546753, "learning_rate": 7.827811969796706e-07, "loss": 0.7825, "step": 6324 }, { "epoch": 2.6310480201752333, "grad_norm": 9245.947265625, "learning_rate": 7.810444276368101e-07, "loss": 0.7685, "step": 6325 }, { "epoch": 2.631464004367834, "grad_norm": 1.9462356567382812, "learning_rate": 7.793095088304093e-07, "loss": 0.7851, "step": 6326 }, { "epoch": 2.631879988560435, "grad_norm": 1.829573392868042, "learning_rate": 7.77576440908715e-07, "loss": 0.7166, "step": 6327 }, { "epoch": 2.6322959727530355, "grad_norm": 2.0201597213745117, "learning_rate": 7.758452242196146e-07, "loss": 0.8054, "step": 6328 }, { "epoch": 2.632711956945636, "grad_norm": 1.9439139366149902, "learning_rate": 7.741158591106124e-07, "loss": 0.7556, "step": 6329 }, { "epoch": 2.6331279411382367, "grad_norm": 1.758078694343567, "learning_rate": 7.723883459288518e-07, "loss": 0.7893, "step": 6330 }, { "epoch": 2.6335439253308373, "grad_norm": 1.8894460201263428, "learning_rate": 7.706626850210974e-07, "loss": 0.7436, "step": 6331 }, { "epoch": 2.633959909523438, "grad_norm": 1.8739959001541138, "learning_rate": 7.689388767337458e-07, "loss": 0.7562, "step": 6332 }, { "epoch": 2.634375893716039, "grad_norm": 1.971309781074524, "learning_rate": 7.672169214128167e-07, "loss": 0.8086, "step": 6333 }, { "epoch": 2.6347918779086394, "grad_norm": 1.9228452444076538, "learning_rate": 7.654968194039625e-07, "loss": 0.8015, "step": 6334 }, { "epoch": 2.63520786210124, "grad_norm": 1.913481593132019, "learning_rate": 7.637785710524626e-07, "loss": 0.8099, "step": 6335 }, { "epoch": 2.635623846293841, "grad_norm": 1.847710371017456, "learning_rate": 7.620621767032221e-07, "loss": 0.8647, "step": 6336 }, { "epoch": 2.6360398304864416, "grad_norm": 1.8285144567489624, "learning_rate": 7.603476367007801e-07, "loss": 0.8181, "step": 6337 }, { "epoch": 2.636455814679042, "grad_norm": 1.8204984664916992, "learning_rate": 7.586349513892932e-07, "loss": 0.5734, "step": 6338 }, { "epoch": 2.636871798871643, "grad_norm": 1.755560040473938, "learning_rate": 7.569241211125566e-07, "loss": 0.7298, "step": 6339 }, { "epoch": 2.6372877830642434, "grad_norm": 1.756558895111084, "learning_rate": 7.552151462139856e-07, "loss": 0.7024, "step": 6340 }, { "epoch": 2.637703767256844, "grad_norm": 2.06280779838562, "learning_rate": 7.535080270366268e-07, "loss": 0.8477, "step": 6341 }, { "epoch": 2.638119751449445, "grad_norm": 2.0412871837615967, "learning_rate": 7.518027639231518e-07, "loss": 0.6346, "step": 6342 }, { "epoch": 2.6385357356420456, "grad_norm": 2.003290891647339, "learning_rate": 7.500993572158632e-07, "loss": 0.7415, "step": 6343 }, { "epoch": 2.638951719834646, "grad_norm": 1.9243605136871338, "learning_rate": 7.483978072566844e-07, "loss": 0.758, "step": 6344 }, { "epoch": 2.639367704027247, "grad_norm": 2.682936906814575, "learning_rate": 7.466981143871732e-07, "loss": 0.7061, "step": 6345 }, { "epoch": 2.6397836882198478, "grad_norm": 1.932411789894104, "learning_rate": 7.450002789485133e-07, "loss": 0.7619, "step": 6346 }, { "epoch": 2.6401996724124484, "grad_norm": 1.9532768726348877, "learning_rate": 7.433043012815111e-07, "loss": 0.6527, "step": 6347 }, { "epoch": 2.640615656605049, "grad_norm": 1.7708042860031128, "learning_rate": 7.416101817266053e-07, "loss": 0.7133, "step": 6348 }, { "epoch": 2.6410316407976495, "grad_norm": 1.8292094469070435, "learning_rate": 7.399179206238571e-07, "loss": 0.7771, "step": 6349 }, { "epoch": 2.64144762499025, "grad_norm": 1.9231235980987549, "learning_rate": 7.382275183129605e-07, "loss": 0.8803, "step": 6350 }, { "epoch": 2.641863609182851, "grad_norm": 1.7941839694976807, "learning_rate": 7.365389751332308e-07, "loss": 0.7594, "step": 6351 }, { "epoch": 2.6422795933754517, "grad_norm": 2.0518405437469482, "learning_rate": 7.348522914236111e-07, "loss": 0.9236, "step": 6352 }, { "epoch": 2.6426955775680523, "grad_norm": 1.8425687551498413, "learning_rate": 7.331674675226719e-07, "loss": 0.7427, "step": 6353 }, { "epoch": 2.6431115617606533, "grad_norm": 2.0136964321136475, "learning_rate": 7.314845037686136e-07, "loss": 0.7243, "step": 6354 }, { "epoch": 2.643527545953254, "grad_norm": 2.1810412406921387, "learning_rate": 7.29803400499256e-07, "loss": 0.7572, "step": 6355 }, { "epoch": 2.6439435301458545, "grad_norm": 1.9349896907806396, "learning_rate": 7.281241580520549e-07, "loss": 0.7785, "step": 6356 }, { "epoch": 2.644359514338455, "grad_norm": 2.0078771114349365, "learning_rate": 7.264467767640826e-07, "loss": 0.8822, "step": 6357 }, { "epoch": 2.6447754985310556, "grad_norm": 1.932741403579712, "learning_rate": 7.247712569720478e-07, "loss": 0.7283, "step": 6358 }, { "epoch": 2.6451914827236562, "grad_norm": 1.797377586364746, "learning_rate": 7.230975990122757e-07, "loss": 0.7169, "step": 6359 }, { "epoch": 2.6456074669162573, "grad_norm": 1.845323920249939, "learning_rate": 7.214258032207233e-07, "loss": 0.7878, "step": 6360 }, { "epoch": 2.646023451108858, "grad_norm": 2.028562307357788, "learning_rate": 7.197558699329777e-07, "loss": 0.805, "step": 6361 }, { "epoch": 2.6464394353014584, "grad_norm": 1.8655188083648682, "learning_rate": 7.180877994842417e-07, "loss": 0.7382, "step": 6362 }, { "epoch": 2.6468554194940594, "grad_norm": 1.9893653392791748, "learning_rate": 7.164215922093531e-07, "loss": 0.8117, "step": 6363 }, { "epoch": 2.64727140368666, "grad_norm": 2.083449125289917, "learning_rate": 7.1475724844277e-07, "loss": 0.7923, "step": 6364 }, { "epoch": 2.6476873878792606, "grad_norm": 2.172511339187622, "learning_rate": 7.130947685185818e-07, "loss": 0.9624, "step": 6365 }, { "epoch": 2.648103372071861, "grad_norm": 1.9422926902770996, "learning_rate": 7.11434152770497e-07, "loss": 0.8815, "step": 6366 }, { "epoch": 2.6485193562644618, "grad_norm": 2.0946099758148193, "learning_rate": 7.097754015318592e-07, "loss": 0.8207, "step": 6367 }, { "epoch": 2.6489353404570624, "grad_norm": 1.8197758197784424, "learning_rate": 7.081185151356273e-07, "loss": 0.8552, "step": 6368 }, { "epoch": 2.6493513246496634, "grad_norm": 1.8577297925949097, "learning_rate": 7.064634939143933e-07, "loss": 0.7062, "step": 6369 }, { "epoch": 2.649767308842264, "grad_norm": 1.9174593687057495, "learning_rate": 7.048103382003713e-07, "loss": 0.7391, "step": 6370 }, { "epoch": 2.6501832930348646, "grad_norm": 1.8986101150512695, "learning_rate": 7.031590483254047e-07, "loss": 0.7647, "step": 6371 }, { "epoch": 2.6505992772274656, "grad_norm": 1.9139549732208252, "learning_rate": 7.015096246209574e-07, "loss": 0.7094, "step": 6372 }, { "epoch": 2.651015261420066, "grad_norm": 1.948304533958435, "learning_rate": 6.998620674181211e-07, "loss": 0.6777, "step": 6373 }, { "epoch": 2.6514312456126667, "grad_norm": 1.8441858291625977, "learning_rate": 6.982163770476125e-07, "loss": 0.777, "step": 6374 }, { "epoch": 2.6518472298052673, "grad_norm": 1.8637783527374268, "learning_rate": 6.965725538397727e-07, "loss": 0.7974, "step": 6375 }, { "epoch": 2.652263213997868, "grad_norm": 1.8303097486495972, "learning_rate": 6.949305981245724e-07, "loss": 0.6985, "step": 6376 }, { "epoch": 2.6526791981904685, "grad_norm": 1.7655537128448486, "learning_rate": 6.932905102316023e-07, "loss": 0.7186, "step": 6377 }, { "epoch": 2.6530951823830695, "grad_norm": 1.8054267168045044, "learning_rate": 6.916522904900802e-07, "loss": 0.733, "step": 6378 }, { "epoch": 2.65351116657567, "grad_norm": 1.93466055393219, "learning_rate": 6.900159392288475e-07, "loss": 0.9412, "step": 6379 }, { "epoch": 2.6539271507682707, "grad_norm": 2344.35400390625, "learning_rate": 6.883814567763746e-07, "loss": 0.7019, "step": 6380 }, { "epoch": 2.6543431349608717, "grad_norm": 2.0650737285614014, "learning_rate": 6.867488434607517e-07, "loss": 0.8712, "step": 6381 }, { "epoch": 2.6547591191534723, "grad_norm": 2.0871405601501465, "learning_rate": 6.851180996096962e-07, "loss": 0.7942, "step": 6382 }, { "epoch": 2.655175103346073, "grad_norm": 1.7096378803253174, "learning_rate": 6.834892255505499e-07, "loss": 0.7516, "step": 6383 }, { "epoch": 2.6555910875386735, "grad_norm": 1.8179116249084473, "learning_rate": 6.818622216102799e-07, "loss": 0.7731, "step": 6384 }, { "epoch": 2.656007071731274, "grad_norm": 1.7930830717086792, "learning_rate": 6.802370881154774e-07, "loss": 0.6974, "step": 6385 }, { "epoch": 2.6564230559238746, "grad_norm": 1.891518235206604, "learning_rate": 6.786138253923591e-07, "loss": 0.7984, "step": 6386 }, { "epoch": 2.6568390401164756, "grad_norm": 2.0088984966278076, "learning_rate": 6.769924337667622e-07, "loss": 0.8213, "step": 6387 }, { "epoch": 2.6572550243090762, "grad_norm": 1.944446086883545, "learning_rate": 6.753729135641551e-07, "loss": 0.7794, "step": 6388 }, { "epoch": 2.657671008501677, "grad_norm": 2.023498296737671, "learning_rate": 6.737552651096246e-07, "loss": 0.8374, "step": 6389 }, { "epoch": 2.658086992694278, "grad_norm": 2.144050359725952, "learning_rate": 6.721394887278854e-07, "loss": 0.8477, "step": 6390 }, { "epoch": 2.6585029768868784, "grad_norm": 2.0543720722198486, "learning_rate": 6.705255847432745e-07, "loss": 0.7189, "step": 6391 }, { "epoch": 2.658918961079479, "grad_norm": 2.0453732013702393, "learning_rate": 6.689135534797509e-07, "loss": 0.7125, "step": 6392 }, { "epoch": 2.6593349452720796, "grad_norm": 1.9866310358047485, "learning_rate": 6.673033952609054e-07, "loss": 0.7926, "step": 6393 }, { "epoch": 2.65975092946468, "grad_norm": 29.356998443603516, "learning_rate": 6.65695110409943e-07, "loss": 0.876, "step": 6394 }, { "epoch": 2.6601669136572808, "grad_norm": 2.045618772506714, "learning_rate": 6.640886992497009e-07, "loss": 0.8433, "step": 6395 }, { "epoch": 2.660582897849882, "grad_norm": 120.7717514038086, "learning_rate": 6.624841621026345e-07, "loss": 0.7389, "step": 6396 }, { "epoch": 2.6609988820424824, "grad_norm": 12.584959983825684, "learning_rate": 6.608814992908263e-07, "loss": 0.7747, "step": 6397 }, { "epoch": 2.661414866235083, "grad_norm": 1.9000314474105835, "learning_rate": 6.592807111359812e-07, "loss": 0.7906, "step": 6398 }, { "epoch": 2.661830850427684, "grad_norm": 1.8564742803573608, "learning_rate": 6.576817979594285e-07, "loss": 0.8305, "step": 6399 }, { "epoch": 2.6622468346202846, "grad_norm": 1.8785759210586548, "learning_rate": 6.560847600821197e-07, "loss": 0.7874, "step": 6400 }, { "epoch": 2.662662818812885, "grad_norm": 1.870309829711914, "learning_rate": 6.544895978246324e-07, "loss": 0.8082, "step": 6401 }, { "epoch": 2.6630788030054857, "grad_norm": 2.0088701248168945, "learning_rate": 6.528963115071663e-07, "loss": 0.7958, "step": 6402 }, { "epoch": 2.6634947871980863, "grad_norm": 1.9601503610610962, "learning_rate": 6.513049014495421e-07, "loss": 0.7333, "step": 6403 }, { "epoch": 2.663910771390687, "grad_norm": 1.9484819173812866, "learning_rate": 6.497153679712076e-07, "loss": 0.8518, "step": 6404 }, { "epoch": 2.664326755583288, "grad_norm": 28.521480560302734, "learning_rate": 6.481277113912332e-07, "loss": 0.8568, "step": 6405 }, { "epoch": 2.6647427397758885, "grad_norm": 2.19527268409729, "learning_rate": 6.465419320283095e-07, "loss": 0.8403, "step": 6406 }, { "epoch": 2.665158723968489, "grad_norm": 1.9322820901870728, "learning_rate": 6.449580302007553e-07, "loss": 0.7348, "step": 6407 }, { "epoch": 2.66557470816109, "grad_norm": 2.0576157569885254, "learning_rate": 6.433760062265104e-07, "loss": 0.8861, "step": 6408 }, { "epoch": 2.6659906923536907, "grad_norm": 1.795468807220459, "learning_rate": 6.417958604231334e-07, "loss": 0.7563, "step": 6409 }, { "epoch": 2.6664066765462913, "grad_norm": 1.8709028959274292, "learning_rate": 6.402175931078147e-07, "loss": 0.7393, "step": 6410 }, { "epoch": 2.666822660738892, "grad_norm": 1.8899823427200317, "learning_rate": 6.386412045973588e-07, "loss": 0.7587, "step": 6411 }, { "epoch": 2.6672386449314924, "grad_norm": 1.7401961088180542, "learning_rate": 6.370666952081983e-07, "loss": 0.7634, "step": 6412 }, { "epoch": 2.667654629124093, "grad_norm": 1.8957897424697876, "learning_rate": 6.354940652563845e-07, "loss": 0.7864, "step": 6413 }, { "epoch": 2.668070613316694, "grad_norm": 2.061638832092285, "learning_rate": 6.339233150575985e-07, "loss": 0.9719, "step": 6414 }, { "epoch": 2.6684865975092946, "grad_norm": 1.857013463973999, "learning_rate": 6.323544449271346e-07, "loss": 0.8269, "step": 6415 }, { "epoch": 2.668902581701895, "grad_norm": 1.930032730102539, "learning_rate": 6.307874551799198e-07, "loss": 0.7521, "step": 6416 }, { "epoch": 2.6693185658944962, "grad_norm": 1.8889058828353882, "learning_rate": 6.292223461304947e-07, "loss": 0.7369, "step": 6417 }, { "epoch": 2.669734550087097, "grad_norm": 1.9358983039855957, "learning_rate": 6.276591180930292e-07, "loss": 0.8186, "step": 6418 }, { "epoch": 2.6701505342796974, "grad_norm": 1.960425853729248, "learning_rate": 6.260977713813099e-07, "loss": 0.7838, "step": 6419 }, { "epoch": 2.670566518472298, "grad_norm": 1.739951491355896, "learning_rate": 6.245383063087518e-07, "loss": 0.7459, "step": 6420 }, { "epoch": 2.6709825026648986, "grad_norm": 2.028838872909546, "learning_rate": 6.229807231883867e-07, "loss": 0.8813, "step": 6421 }, { "epoch": 2.671398486857499, "grad_norm": 23.935409545898438, "learning_rate": 6.2142502233287e-07, "loss": 0.7796, "step": 6422 }, { "epoch": 2.6718144710501, "grad_norm": 1.9149857759475708, "learning_rate": 6.19871204054483e-07, "loss": 0.865, "step": 6423 }, { "epoch": 2.6722304552427008, "grad_norm": 1.8830790519714355, "learning_rate": 6.18319268665123e-07, "loss": 0.7878, "step": 6424 }, { "epoch": 2.6726464394353013, "grad_norm": 1.8109071254730225, "learning_rate": 6.167692164763162e-07, "loss": 0.7404, "step": 6425 }, { "epoch": 2.6730624236279024, "grad_norm": 1.951806902885437, "learning_rate": 6.152210477992026e-07, "loss": 0.8281, "step": 6426 }, { "epoch": 2.673478407820503, "grad_norm": 1.9325604438781738, "learning_rate": 6.136747629445538e-07, "loss": 0.8024, "step": 6427 }, { "epoch": 2.6738943920131035, "grad_norm": 1.9916211366653442, "learning_rate": 6.121303622227536e-07, "loss": 0.7526, "step": 6428 }, { "epoch": 2.674310376205704, "grad_norm": 1.9829905033111572, "learning_rate": 6.105878459438164e-07, "loss": 0.8263, "step": 6429 }, { "epoch": 2.6747263603983047, "grad_norm": 1.9174914360046387, "learning_rate": 6.090472144173721e-07, "loss": 0.8698, "step": 6430 }, { "epoch": 2.6751423445909053, "grad_norm": 1.9860646724700928, "learning_rate": 6.075084679526744e-07, "loss": 0.7725, "step": 6431 }, { "epoch": 2.6755583287835063, "grad_norm": 1.8067617416381836, "learning_rate": 6.059716068585953e-07, "loss": 0.8331, "step": 6432 }, { "epoch": 2.675974312976107, "grad_norm": 1.9189530611038208, "learning_rate": 6.044366314436368e-07, "loss": 0.7708, "step": 6433 }, { "epoch": 2.6763902971687075, "grad_norm": 2.078866958618164, "learning_rate": 6.029035420159135e-07, "loss": 0.7977, "step": 6434 }, { "epoch": 2.6768062813613085, "grad_norm": 2.1808292865753174, "learning_rate": 6.013723388831672e-07, "loss": 0.814, "step": 6435 }, { "epoch": 2.677222265553909, "grad_norm": 1.9545915126800537, "learning_rate": 5.998430223527574e-07, "loss": 0.8173, "step": 6436 }, { "epoch": 2.6776382497465097, "grad_norm": 2.03910231590271, "learning_rate": 5.983155927316653e-07, "loss": 0.8719, "step": 6437 }, { "epoch": 2.6780542339391102, "grad_norm": 1.9704091548919678, "learning_rate": 5.96790050326499e-07, "loss": 0.8229, "step": 6438 }, { "epoch": 2.678470218131711, "grad_norm": 2.023574113845825, "learning_rate": 5.952663954434779e-07, "loss": 0.7395, "step": 6439 }, { "epoch": 2.6788862023243114, "grad_norm": 1.8796924352645874, "learning_rate": 5.937446283884518e-07, "loss": 0.7671, "step": 6440 }, { "epoch": 2.6793021865169124, "grad_norm": 1.7766672372817993, "learning_rate": 5.922247494668853e-07, "loss": 0.7427, "step": 6441 }, { "epoch": 2.679718170709513, "grad_norm": 2.0067567825317383, "learning_rate": 5.907067589838678e-07, "loss": 0.7859, "step": 6442 }, { "epoch": 2.6801341549021136, "grad_norm": 86.3597412109375, "learning_rate": 5.891906572441042e-07, "loss": 0.8026, "step": 6443 }, { "epoch": 2.6805501390947146, "grad_norm": 4.096600532531738, "learning_rate": 5.876764445519301e-07, "loss": 0.8109, "step": 6444 }, { "epoch": 2.680966123287315, "grad_norm": 1.8463188409805298, "learning_rate": 5.861641212112901e-07, "loss": 0.7662, "step": 6445 }, { "epoch": 2.681382107479916, "grad_norm": 2.0058724880218506, "learning_rate": 5.846536875257613e-07, "loss": 0.8021, "step": 6446 }, { "epoch": 2.6817980916725164, "grad_norm": 1.9929187297821045, "learning_rate": 5.831451437985291e-07, "loss": 0.8911, "step": 6447 }, { "epoch": 2.682214075865117, "grad_norm": 1.9607970714569092, "learning_rate": 5.816384903324124e-07, "loss": 0.7484, "step": 6448 }, { "epoch": 2.6826300600577175, "grad_norm": 1.8086918592453003, "learning_rate": 5.801337274298391e-07, "loss": 0.7364, "step": 6449 }, { "epoch": 2.6830460442503186, "grad_norm": 1.9381351470947266, "learning_rate": 5.786308553928666e-07, "loss": 0.8833, "step": 6450 }, { "epoch": 2.683462028442919, "grad_norm": 155.24708557128906, "learning_rate": 5.771298745231679e-07, "loss": 0.775, "step": 6451 }, { "epoch": 2.6838780126355197, "grad_norm": 1.832846999168396, "learning_rate": 5.756307851220344e-07, "loss": 0.7896, "step": 6452 }, { "epoch": 2.6842939968281208, "grad_norm": 2.0411407947540283, "learning_rate": 5.741335874903853e-07, "loss": 0.7907, "step": 6453 }, { "epoch": 2.6847099810207213, "grad_norm": 1.8016698360443115, "learning_rate": 5.726382819287523e-07, "loss": 0.8323, "step": 6454 }, { "epoch": 2.685125965213322, "grad_norm": 9.715655326843262, "learning_rate": 5.711448687372933e-07, "loss": 0.8085, "step": 6455 }, { "epoch": 2.6855419494059225, "grad_norm": 1.7804067134857178, "learning_rate": 5.696533482157807e-07, "loss": 0.6099, "step": 6456 }, { "epoch": 2.685957933598523, "grad_norm": 1.914820909500122, "learning_rate": 5.681637206636125e-07, "loss": 0.9062, "step": 6457 }, { "epoch": 2.6863739177911237, "grad_norm": 1.959635615348816, "learning_rate": 5.666759863798021e-07, "loss": 0.8766, "step": 6458 }, { "epoch": 2.6867899019837247, "grad_norm": 204.11033630371094, "learning_rate": 5.651901456629871e-07, "loss": 0.6707, "step": 6459 }, { "epoch": 2.6872058861763253, "grad_norm": 1.9904242753982544, "learning_rate": 5.637061988114223e-07, "loss": 0.8225, "step": 6460 }, { "epoch": 2.687621870368926, "grad_norm": 1.8204290866851807, "learning_rate": 5.622241461229816e-07, "loss": 0.7323, "step": 6461 }, { "epoch": 2.688037854561527, "grad_norm": 1.9350494146347046, "learning_rate": 5.607439878951582e-07, "loss": 0.7911, "step": 6462 }, { "epoch": 2.6884538387541275, "grad_norm": 1.792510986328125, "learning_rate": 5.592657244250709e-07, "loss": 0.7742, "step": 6463 }, { "epoch": 2.688869822946728, "grad_norm": 1.8507262468338013, "learning_rate": 5.577893560094516e-07, "loss": 0.669, "step": 6464 }, { "epoch": 2.6892858071393286, "grad_norm": 1.9409852027893066, "learning_rate": 5.563148829446551e-07, "loss": 0.8169, "step": 6465 }, { "epoch": 2.689701791331929, "grad_norm": 1.7875577211380005, "learning_rate": 5.548423055266539e-07, "loss": 0.8128, "step": 6466 }, { "epoch": 2.69011777552453, "grad_norm": 1.906275987625122, "learning_rate": 5.533716240510412e-07, "loss": 0.7343, "step": 6467 }, { "epoch": 2.690533759717131, "grad_norm": 1.9978293180465698, "learning_rate": 5.519028388130321e-07, "loss": 0.7509, "step": 6468 }, { "epoch": 2.6909497439097314, "grad_norm": 1.8705873489379883, "learning_rate": 5.504359501074553e-07, "loss": 0.9379, "step": 6469 }, { "epoch": 2.691365728102332, "grad_norm": 1.9488205909729004, "learning_rate": 5.489709582287616e-07, "loss": 0.8376, "step": 6470 }, { "epoch": 2.691781712294933, "grad_norm": 1.8320097923278809, "learning_rate": 5.475078634710218e-07, "loss": 0.9155, "step": 6471 }, { "epoch": 2.6921976964875336, "grad_norm": 1.891096830368042, "learning_rate": 5.460466661279262e-07, "loss": 0.8389, "step": 6472 }, { "epoch": 2.692613680680134, "grad_norm": 198.0396270751953, "learning_rate": 5.445873664927825e-07, "loss": 0.7833, "step": 6473 }, { "epoch": 2.6930296648727348, "grad_norm": 1.9475308656692505, "learning_rate": 5.431299648585198e-07, "loss": 0.7939, "step": 6474 }, { "epoch": 2.6934456490653353, "grad_norm": 1.9637675285339355, "learning_rate": 5.416744615176828e-07, "loss": 0.8391, "step": 6475 }, { "epoch": 2.693861633257936, "grad_norm": 8.047320365905762, "learning_rate": 5.40220856762439e-07, "loss": 0.6796, "step": 6476 }, { "epoch": 2.694277617450537, "grad_norm": 1.8706790208816528, "learning_rate": 5.387691508845705e-07, "loss": 0.802, "step": 6477 }, { "epoch": 2.6946936016431375, "grad_norm": 1.8688842058181763, "learning_rate": 5.373193441754843e-07, "loss": 0.8286, "step": 6478 }, { "epoch": 2.695109585835738, "grad_norm": 1.8817553520202637, "learning_rate": 5.358714369261986e-07, "loss": 0.7547, "step": 6479 }, { "epoch": 2.695525570028339, "grad_norm": 69.98499298095703, "learning_rate": 5.344254294273576e-07, "loss": 0.7781, "step": 6480 }, { "epoch": 2.6959415542209397, "grad_norm": 2.027764081954956, "learning_rate": 5.329813219692193e-07, "loss": 0.9536, "step": 6481 }, { "epoch": 2.6963575384135403, "grad_norm": 2.0069069862365723, "learning_rate": 5.315391148416605e-07, "loss": 0.8697, "step": 6482 }, { "epoch": 2.696773522606141, "grad_norm": 1.9709230661392212, "learning_rate": 5.300988083341807e-07, "loss": 0.6938, "step": 6483 }, { "epoch": 2.6971895067987415, "grad_norm": 1.9749506711959839, "learning_rate": 5.286604027358921e-07, "loss": 0.7636, "step": 6484 }, { "epoch": 2.697605490991342, "grad_norm": 1.8944274187088013, "learning_rate": 5.272238983355315e-07, "loss": 0.6708, "step": 6485 }, { "epoch": 2.698021475183943, "grad_norm": 1.9692915678024292, "learning_rate": 5.257892954214472e-07, "loss": 0.7897, "step": 6486 }, { "epoch": 2.6984374593765437, "grad_norm": 1.9558252096176147, "learning_rate": 5.24356594281612e-07, "loss": 0.8088, "step": 6487 }, { "epoch": 2.6988534435691443, "grad_norm": 1.9387027025222778, "learning_rate": 5.229257952036138e-07, "loss": 0.7849, "step": 6488 }, { "epoch": 2.6992694277617453, "grad_norm": 2.0295350551605225, "learning_rate": 5.214968984746604e-07, "loss": 0.8811, "step": 6489 }, { "epoch": 2.699685411954346, "grad_norm": 1.8347759246826172, "learning_rate": 5.200699043815749e-07, "loss": 0.8083, "step": 6490 }, { "epoch": 2.7001013961469464, "grad_norm": 1.9759104251861572, "learning_rate": 5.186448132108013e-07, "loss": 0.8192, "step": 6491 }, { "epoch": 2.700517380339547, "grad_norm": 1.9430350065231323, "learning_rate": 5.172216252483975e-07, "loss": 0.8542, "step": 6492 }, { "epoch": 2.7009333645321476, "grad_norm": 1.9063969850540161, "learning_rate": 5.158003407800483e-07, "loss": 0.7181, "step": 6493 }, { "epoch": 2.701349348724748, "grad_norm": 1.8109276294708252, "learning_rate": 5.143809600910444e-07, "loss": 0.7735, "step": 6494 }, { "epoch": 2.701765332917349, "grad_norm": 2.057941436767578, "learning_rate": 5.129634834663044e-07, "loss": 0.9064, "step": 6495 }, { "epoch": 2.70218131710995, "grad_norm": 2.096590280532837, "learning_rate": 5.115479111903599e-07, "loss": 0.8577, "step": 6496 }, { "epoch": 2.7025973013025504, "grad_norm": 1.9651691913604736, "learning_rate": 5.1013424354736e-07, "loss": 0.7671, "step": 6497 }, { "epoch": 2.7030132854951514, "grad_norm": 1.9537928104400635, "learning_rate": 5.087224808210745e-07, "loss": 0.8023, "step": 6498 }, { "epoch": 2.703429269687752, "grad_norm": 1.8356126546859741, "learning_rate": 5.073126232948877e-07, "loss": 0.8426, "step": 6499 }, { "epoch": 2.7038452538803526, "grad_norm": 2.0219528675079346, "learning_rate": 5.059046712518034e-07, "loss": 0.8408, "step": 6500 }, { "epoch": 2.7038452538803526, "eval_loss": 0.7520768642425537, "eval_runtime": 1863.2184, "eval_samples_per_second": 3.537, "eval_steps_per_second": 1.769, "step": 6500 }, { "epoch": 2.704261238072953, "grad_norm": 2.1464428901672363, "learning_rate": 5.044986249744399e-07, "loss": 0.8437, "step": 6501 }, { "epoch": 2.7046772222655537, "grad_norm": 1.8237895965576172, "learning_rate": 5.030944847450381e-07, "loss": 0.7889, "step": 6502 }, { "epoch": 2.7050932064581543, "grad_norm": 7.667016506195068, "learning_rate": 5.016922508454514e-07, "loss": 0.7547, "step": 6503 }, { "epoch": 2.7055091906507553, "grad_norm": 1.8925992250442505, "learning_rate": 5.002919235571546e-07, "loss": 0.7097, "step": 6504 }, { "epoch": 2.705925174843356, "grad_norm": 1.9647178649902344, "learning_rate": 4.98893503161234e-07, "loss": 0.7011, "step": 6505 }, { "epoch": 2.7063411590359565, "grad_norm": 2.194051504135132, "learning_rate": 4.97496989938403e-07, "loss": 0.8057, "step": 6506 }, { "epoch": 2.7067571432285575, "grad_norm": 1.8752590417861938, "learning_rate": 4.961023841689793e-07, "loss": 0.7322, "step": 6507 }, { "epoch": 2.707173127421158, "grad_norm": 61.47467803955078, "learning_rate": 4.947096861329104e-07, "loss": 0.7281, "step": 6508 }, { "epoch": 2.7075891116137587, "grad_norm": 1.851729393005371, "learning_rate": 4.933188961097513e-07, "loss": 0.7038, "step": 6509 }, { "epoch": 2.7080050958063593, "grad_norm": 1.9516932964324951, "learning_rate": 4.919300143786787e-07, "loss": 0.828, "step": 6510 }, { "epoch": 2.70842107999896, "grad_norm": 2.06684947013855, "learning_rate": 4.905430412184831e-07, "loss": 0.86, "step": 6511 }, { "epoch": 2.7088370641915605, "grad_norm": 2.042044162750244, "learning_rate": 4.891579769075772e-07, "loss": 0.7542, "step": 6512 }, { "epoch": 2.7092530483841615, "grad_norm": 1.9275176525115967, "learning_rate": 4.877748217239864e-07, "loss": 0.679, "step": 6513 }, { "epoch": 2.709669032576762, "grad_norm": 1.867665410041809, "learning_rate": 4.86393575945352e-07, "loss": 0.8417, "step": 6514 }, { "epoch": 2.7100850167693626, "grad_norm": 1.8135194778442383, "learning_rate": 4.850142398489365e-07, "loss": 0.8099, "step": 6515 }, { "epoch": 2.7105010009619637, "grad_norm": 314.76806640625, "learning_rate": 4.836368137116132e-07, "loss": 0.9458, "step": 6516 }, { "epoch": 2.7109169851545643, "grad_norm": 1.8182249069213867, "learning_rate": 4.822612978098784e-07, "loss": 0.6532, "step": 6517 }, { "epoch": 2.711332969347165, "grad_norm": 1.9558666944503784, "learning_rate": 4.808876924198391e-07, "loss": 0.7722, "step": 6518 }, { "epoch": 2.7117489535397654, "grad_norm": 1.6468709707260132, "learning_rate": 4.795159978172237e-07, "loss": 0.6386, "step": 6519 }, { "epoch": 2.712164937732366, "grad_norm": 2.0103414058685303, "learning_rate": 4.78146214277373e-07, "loss": 0.8335, "step": 6520 }, { "epoch": 2.7125809219249666, "grad_norm": 1.8193342685699463, "learning_rate": 4.76778342075247e-07, "loss": 0.6841, "step": 6521 }, { "epoch": 2.7129969061175676, "grad_norm": 1.8852275609970093, "learning_rate": 4.7541238148541944e-07, "loss": 0.7988, "step": 6522 }, { "epoch": 2.713412890310168, "grad_norm": 1.8896139860153198, "learning_rate": 4.740483327820844e-07, "loss": 0.7868, "step": 6523 }, { "epoch": 2.7138288745027688, "grad_norm": 1.8880767822265625, "learning_rate": 4.726861962390472e-07, "loss": 0.735, "step": 6524 }, { "epoch": 2.71424485869537, "grad_norm": 1.8681061267852783, "learning_rate": 4.7132597212973366e-07, "loss": 0.8378, "step": 6525 }, { "epoch": 2.7146608428879704, "grad_norm": 1.8805161714553833, "learning_rate": 4.699676607271808e-07, "loss": 0.8002, "step": 6526 }, { "epoch": 2.715076827080571, "grad_norm": 2.1510417461395264, "learning_rate": 4.686112623040495e-07, "loss": 0.9555, "step": 6527 }, { "epoch": 2.7154928112731715, "grad_norm": 336.0035705566406, "learning_rate": 4.6725677713260753e-07, "loss": 0.8354, "step": 6528 }, { "epoch": 2.715908795465772, "grad_norm": 1.9607871770858765, "learning_rate": 4.6590420548474734e-07, "loss": 0.7571, "step": 6529 }, { "epoch": 2.7163247796583727, "grad_norm": 1.9888272285461426, "learning_rate": 4.645535476319696e-07, "loss": 0.795, "step": 6530 }, { "epoch": 2.7167407638509737, "grad_norm": 1.8025621175765991, "learning_rate": 4.632048038453929e-07, "loss": 0.7239, "step": 6531 }, { "epoch": 2.7171567480435743, "grad_norm": 1.9490097761154175, "learning_rate": 4.618579743957563e-07, "loss": 0.7514, "step": 6532 }, { "epoch": 2.717572732236175, "grad_norm": 2.0840237140655518, "learning_rate": 4.605130595534091e-07, "loss": 0.8695, "step": 6533 }, { "epoch": 2.717988716428776, "grad_norm": 1.955286979675293, "learning_rate": 4.591700595883208e-07, "loss": 0.7717, "step": 6534 }, { "epoch": 2.7184047006213765, "grad_norm": 1.9728305339813232, "learning_rate": 4.5782897477006926e-07, "loss": 0.7707, "step": 6535 }, { "epoch": 2.718820684813977, "grad_norm": 1.8198391199111938, "learning_rate": 4.564898053678579e-07, "loss": 0.7992, "step": 6536 }, { "epoch": 2.7192366690065777, "grad_norm": 1.844071388244629, "learning_rate": 4.551525516504973e-07, "loss": 0.7601, "step": 6537 }, { "epoch": 2.7196526531991783, "grad_norm": 1.9432878494262695, "learning_rate": 4.538172138864183e-07, "loss": 0.7559, "step": 6538 }, { "epoch": 2.720068637391779, "grad_norm": 1.8054085969924927, "learning_rate": 4.524837923436642e-07, "loss": 0.7199, "step": 6539 }, { "epoch": 2.72048462158438, "grad_norm": 2.016303539276123, "learning_rate": 4.5115228728989655e-07, "loss": 0.8048, "step": 6540 }, { "epoch": 2.7209006057769805, "grad_norm": 1.7219024896621704, "learning_rate": 4.4982269899238706e-07, "loss": 0.7539, "step": 6541 }, { "epoch": 2.721316589969581, "grad_norm": 2.0138065814971924, "learning_rate": 4.484950277180311e-07, "loss": 0.9249, "step": 6542 }, { "epoch": 2.721732574162182, "grad_norm": 15.185884475708008, "learning_rate": 4.4716927373332995e-07, "loss": 0.718, "step": 6543 }, { "epoch": 2.7221485583547826, "grad_norm": 1.969486117362976, "learning_rate": 4.4584543730440633e-07, "loss": 0.7206, "step": 6544 }, { "epoch": 2.7225645425473832, "grad_norm": 1.9457406997680664, "learning_rate": 4.445235186969976e-07, "loss": 0.8818, "step": 6545 }, { "epoch": 2.722980526739984, "grad_norm": 35.2348747253418, "learning_rate": 4.432035181764516e-07, "loss": 0.6434, "step": 6546 }, { "epoch": 2.7233965109325844, "grad_norm": 1.7866394519805908, "learning_rate": 4.418854360077385e-07, "loss": 0.8543, "step": 6547 }, { "epoch": 2.723812495125185, "grad_norm": 1.8621840476989746, "learning_rate": 4.4056927245543557e-07, "loss": 0.836, "step": 6548 }, { "epoch": 2.724228479317786, "grad_norm": 1.9379196166992188, "learning_rate": 4.392550277837404e-07, "loss": 0.8338, "step": 6549 }, { "epoch": 2.7246444635103866, "grad_norm": 1.8193104267120361, "learning_rate": 4.3794270225646085e-07, "loss": 0.7942, "step": 6550 }, { "epoch": 2.725060447702987, "grad_norm": 10.767245292663574, "learning_rate": 4.366322961370262e-07, "loss": 0.8026, "step": 6551 }, { "epoch": 2.725476431895588, "grad_norm": 1.885407567024231, "learning_rate": 4.3532380968847157e-07, "loss": 0.7535, "step": 6552 }, { "epoch": 2.7258924160881888, "grad_norm": 1.9600608348846436, "learning_rate": 4.3401724317345686e-07, "loss": 0.8485, "step": 6553 }, { "epoch": 2.7263084002807894, "grad_norm": 1.9714140892028809, "learning_rate": 4.327125968542478e-07, "loss": 0.6848, "step": 6554 }, { "epoch": 2.72672438447339, "grad_norm": 2.004045248031616, "learning_rate": 4.314098709927306e-07, "loss": 0.7932, "step": 6555 }, { "epoch": 2.7271403686659905, "grad_norm": 1.8199748992919922, "learning_rate": 4.3010906585040035e-07, "loss": 0.7514, "step": 6556 }, { "epoch": 2.727556352858591, "grad_norm": 2.0503320693969727, "learning_rate": 4.2881018168837384e-07, "loss": 0.7975, "step": 6557 }, { "epoch": 2.727972337051192, "grad_norm": 2.019944906234741, "learning_rate": 4.275132187673758e-07, "loss": 0.7968, "step": 6558 }, { "epoch": 2.7283883212437927, "grad_norm": 2.0862789154052734, "learning_rate": 4.262181773477492e-07, "loss": 0.8391, "step": 6559 }, { "epoch": 2.7288043054363933, "grad_norm": 4.531859874725342, "learning_rate": 4.2492505768944813e-07, "loss": 0.8067, "step": 6560 }, { "epoch": 2.7292202896289943, "grad_norm": 1.7449856996536255, "learning_rate": 4.236338600520429e-07, "loss": 0.6658, "step": 6561 }, { "epoch": 2.729636273821595, "grad_norm": 2.1060009002685547, "learning_rate": 4.223445846947205e-07, "loss": 0.8527, "step": 6562 }, { "epoch": 2.7300522580141955, "grad_norm": 3.6473324298858643, "learning_rate": 4.210572318762751e-07, "loss": 0.6575, "step": 6563 }, { "epoch": 2.730468242206796, "grad_norm": 3.4589526653289795, "learning_rate": 4.1977180185512225e-07, "loss": 0.8077, "step": 6564 }, { "epoch": 2.7308842263993967, "grad_norm": 1.9734768867492676, "learning_rate": 4.1848829488928765e-07, "loss": 0.8009, "step": 6565 }, { "epoch": 2.7313002105919972, "grad_norm": 2497.643798828125, "learning_rate": 4.1720671123641197e-07, "loss": 0.7083, "step": 6566 }, { "epoch": 2.7317161947845983, "grad_norm": 1.8079315423965454, "learning_rate": 4.1592705115374833e-07, "loss": 0.79, "step": 6567 }, { "epoch": 2.732132178977199, "grad_norm": 1.8263740539550781, "learning_rate": 4.146493148981678e-07, "loss": 0.7884, "step": 6568 }, { "epoch": 2.7325481631697994, "grad_norm": 2.111785888671875, "learning_rate": 4.1337350272615097e-07, "loss": 0.8393, "step": 6569 }, { "epoch": 2.7329641473624005, "grad_norm": 1.9545687437057495, "learning_rate": 4.120996148937928e-07, "loss": 0.9614, "step": 6570 }, { "epoch": 2.733380131555001, "grad_norm": 1.9671823978424072, "learning_rate": 4.1082765165680325e-07, "loss": 0.7663, "step": 6571 }, { "epoch": 2.7337961157476016, "grad_norm": 1.78791081905365, "learning_rate": 4.095576132705081e-07, "loss": 0.9069, "step": 6572 }, { "epoch": 2.734212099940202, "grad_norm": 2.0309720039367676, "learning_rate": 4.0828949998984016e-07, "loss": 0.8548, "step": 6573 }, { "epoch": 2.734628084132803, "grad_norm": 2.0413897037506104, "learning_rate": 4.070233120693534e-07, "loss": 0.7806, "step": 6574 }, { "epoch": 2.7350440683254034, "grad_norm": 1.9314733743667603, "learning_rate": 4.0575904976321135e-07, "loss": 0.713, "step": 6575 }, { "epoch": 2.7354600525180044, "grad_norm": 1.822161078453064, "learning_rate": 4.044967133251887e-07, "loss": 0.7223, "step": 6576 }, { "epoch": 2.735876036710605, "grad_norm": 1.8315175771713257, "learning_rate": 4.0323630300868054e-07, "loss": 0.6772, "step": 6577 }, { "epoch": 2.7362920209032056, "grad_norm": 1.8257851600646973, "learning_rate": 4.01977819066689e-07, "loss": 0.7905, "step": 6578 }, { "epoch": 2.7367080050958066, "grad_norm": 2.734286069869995, "learning_rate": 4.007212617518319e-07, "loss": 0.7878, "step": 6579 }, { "epoch": 2.737123989288407, "grad_norm": 2.7323215007781982, "learning_rate": 3.9946663131633867e-07, "loss": 0.7379, "step": 6580 }, { "epoch": 2.7375399734810077, "grad_norm": 1.9377222061157227, "learning_rate": 3.982139280120556e-07, "loss": 0.916, "step": 6581 }, { "epoch": 2.7379559576736083, "grad_norm": 1.9883919954299927, "learning_rate": 3.969631520904382e-07, "loss": 0.8654, "step": 6582 }, { "epoch": 2.738371941866209, "grad_norm": 1.9634063243865967, "learning_rate": 3.9571430380255793e-07, "loss": 0.8608, "step": 6583 }, { "epoch": 2.7387879260588095, "grad_norm": 1.7255265712738037, "learning_rate": 3.944673833990975e-07, "loss": 0.8324, "step": 6584 }, { "epoch": 2.7392039102514105, "grad_norm": 1.9783936738967896, "learning_rate": 3.9322239113035563e-07, "loss": 0.7831, "step": 6585 }, { "epoch": 2.739619894444011, "grad_norm": 1.9837169647216797, "learning_rate": 3.9197932724623797e-07, "loss": 0.8064, "step": 6586 }, { "epoch": 2.7400358786366117, "grad_norm": 1.7481440305709839, "learning_rate": 3.9073819199626937e-07, "loss": 0.6754, "step": 6587 }, { "epoch": 2.7404518628292127, "grad_norm": 1.864162564277649, "learning_rate": 3.89498985629585e-07, "loss": 0.7777, "step": 6588 }, { "epoch": 2.7408678470218133, "grad_norm": 2.0428688526153564, "learning_rate": 3.882617083949303e-07, "loss": 0.7881, "step": 6589 }, { "epoch": 2.741283831214414, "grad_norm": 2.0409090518951416, "learning_rate": 3.870263605406699e-07, "loss": 0.791, "step": 6590 }, { "epoch": 2.7416998154070145, "grad_norm": 2.026740312576294, "learning_rate": 3.857929423147744e-07, "loss": 0.8216, "step": 6591 }, { "epoch": 2.742115799599615, "grad_norm": 2.042891025543213, "learning_rate": 3.845614539648312e-07, "loss": 0.8218, "step": 6592 }, { "epoch": 2.7425317837922156, "grad_norm": 1.9902997016906738, "learning_rate": 3.833318957380372e-07, "loss": 0.8518, "step": 6593 }, { "epoch": 2.7429477679848167, "grad_norm": 2.04658579826355, "learning_rate": 3.821042678812059e-07, "loss": 0.8718, "step": 6594 }, { "epoch": 2.7433637521774172, "grad_norm": 1.890672206878662, "learning_rate": 3.8087857064076026e-07, "loss": 0.7532, "step": 6595 }, { "epoch": 2.743779736370018, "grad_norm": 1.9966264963150024, "learning_rate": 3.796548042627368e-07, "loss": 0.7617, "step": 6596 }, { "epoch": 2.744195720562619, "grad_norm": 1.8709810972213745, "learning_rate": 3.784329689927835e-07, "loss": 0.6939, "step": 6597 }, { "epoch": 2.7446117047552194, "grad_norm": 1.967759609222412, "learning_rate": 3.7721306507616184e-07, "loss": 0.785, "step": 6598 }, { "epoch": 2.74502768894782, "grad_norm": 1.9179518222808838, "learning_rate": 3.7599509275774606e-07, "loss": 0.8038, "step": 6599 }, { "epoch": 2.7454436731404206, "grad_norm": 1.9071786403656006, "learning_rate": 3.747790522820216e-07, "loss": 0.723, "step": 6600 }, { "epoch": 2.745859657333021, "grad_norm": 1.9199429750442505, "learning_rate": 3.7356494389308327e-07, "loss": 0.7478, "step": 6601 }, { "epoch": 2.7462756415256218, "grad_norm": 37.99307632446289, "learning_rate": 3.7235276783464504e-07, "loss": 0.739, "step": 6602 }, { "epoch": 2.746691625718223, "grad_norm": 1.9404168128967285, "learning_rate": 3.7114252435002665e-07, "loss": 0.8413, "step": 6603 }, { "epoch": 2.7471076099108234, "grad_norm": 2.057636260986328, "learning_rate": 3.6993421368216266e-07, "loss": 0.8667, "step": 6604 }, { "epoch": 2.747523594103424, "grad_norm": 1.8649201393127441, "learning_rate": 3.687278360736013e-07, "loss": 0.7249, "step": 6605 }, { "epoch": 2.747939578296025, "grad_norm": 1.8620212078094482, "learning_rate": 3.6752339176649776e-07, "loss": 0.8799, "step": 6606 }, { "epoch": 2.7483555624886256, "grad_norm": 1.938763976097107, "learning_rate": 3.663208810026253e-07, "loss": 0.8605, "step": 6607 }, { "epoch": 2.748771546681226, "grad_norm": 2.0716333389282227, "learning_rate": 3.651203040233642e-07, "loss": 0.8393, "step": 6608 }, { "epoch": 2.7491875308738267, "grad_norm": 1.7515548467636108, "learning_rate": 3.6392166106970825e-07, "loss": 0.7045, "step": 6609 }, { "epoch": 2.7496035150664273, "grad_norm": 1.960561990737915, "learning_rate": 3.627249523822618e-07, "loss": 0.7674, "step": 6610 }, { "epoch": 2.750019499259028, "grad_norm": 2.0993192195892334, "learning_rate": 3.615301782012448e-07, "loss": 0.8545, "step": 6611 }, { "epoch": 2.750435483451629, "grad_norm": 1.7989436388015747, "learning_rate": 3.6033733876648434e-07, "loss": 0.7673, "step": 6612 }, { "epoch": 2.7508514676442295, "grad_norm": 1.791203498840332, "learning_rate": 3.5914643431742223e-07, "loss": 0.7883, "step": 6613 }, { "epoch": 2.75126745183683, "grad_norm": 1.841638445854187, "learning_rate": 3.579574650931106e-07, "loss": 0.785, "step": 6614 }, { "epoch": 2.751683436029431, "grad_norm": 1.9212803840637207, "learning_rate": 3.567704313322129e-07, "loss": 0.7657, "step": 6615 }, { "epoch": 2.7520994202220317, "grad_norm": 2.1156513690948486, "learning_rate": 3.5558533327300304e-07, "loss": 0.8263, "step": 6616 }, { "epoch": 2.7525154044146323, "grad_norm": 1.998921513557434, "learning_rate": 3.544021711533718e-07, "loss": 0.8405, "step": 6617 }, { "epoch": 2.752931388607233, "grad_norm": 1.8435063362121582, "learning_rate": 3.532209452108137e-07, "loss": 0.7276, "step": 6618 }, { "epoch": 2.7533473727998334, "grad_norm": 1.8464001417160034, "learning_rate": 3.5204165568243797e-07, "loss": 0.7505, "step": 6619 }, { "epoch": 2.753763356992434, "grad_norm": 2.0628623962402344, "learning_rate": 3.5086430280496743e-07, "loss": 0.694, "step": 6620 }, { "epoch": 2.754179341185035, "grad_norm": 1.9908859729766846, "learning_rate": 3.49688886814733e-07, "loss": 0.781, "step": 6621 }, { "epoch": 2.7545953253776356, "grad_norm": 22.122173309326172, "learning_rate": 3.485154079476794e-07, "loss": 0.7646, "step": 6622 }, { "epoch": 2.755011309570236, "grad_norm": 1.7064114809036255, "learning_rate": 3.4734386643935803e-07, "loss": 0.8415, "step": 6623 }, { "epoch": 2.7554272937628372, "grad_norm": 1.8540633916854858, "learning_rate": 3.461742625249376e-07, "loss": 0.736, "step": 6624 }, { "epoch": 2.755843277955438, "grad_norm": 1.765580177307129, "learning_rate": 3.4500659643919244e-07, "loss": 0.786, "step": 6625 }, { "epoch": 2.7562592621480384, "grad_norm": 2.013685941696167, "learning_rate": 3.438408684165118e-07, "loss": 0.8559, "step": 6626 }, { "epoch": 2.756675246340639, "grad_norm": 1.9712550640106201, "learning_rate": 3.4267707869089284e-07, "loss": 0.7549, "step": 6627 }, { "epoch": 2.7570912305332396, "grad_norm": 4.7544965744018555, "learning_rate": 3.4151522749594547e-07, "loss": 0.7823, "step": 6628 }, { "epoch": 2.75750721472584, "grad_norm": 1.9709224700927734, "learning_rate": 3.403553150648897e-07, "loss": 0.7196, "step": 6629 }, { "epoch": 2.757923198918441, "grad_norm": 1.9346919059753418, "learning_rate": 3.391973416305583e-07, "loss": 0.7894, "step": 6630 }, { "epoch": 2.7583391831110418, "grad_norm": 1.7635709047317505, "learning_rate": 3.3804130742538964e-07, "loss": 0.7262, "step": 6631 }, { "epoch": 2.7587551673036423, "grad_norm": 2.050182580947876, "learning_rate": 3.3688721268144155e-07, "loss": 0.881, "step": 6632 }, { "epoch": 2.7591711514962434, "grad_norm": 1.9824237823486328, "learning_rate": 3.3573505763037307e-07, "loss": 0.8917, "step": 6633 }, { "epoch": 2.759587135688844, "grad_norm": 1.977213740348816, "learning_rate": 3.3458484250345924e-07, "loss": 0.7318, "step": 6634 }, { "epoch": 2.7600031198814445, "grad_norm": 16.73761558532715, "learning_rate": 3.334365675315865e-07, "loss": 0.703, "step": 6635 }, { "epoch": 2.760419104074045, "grad_norm": 1.819855809211731, "learning_rate": 3.322902329452493e-07, "loss": 0.9124, "step": 6636 }, { "epoch": 2.7608350882666457, "grad_norm": 1.8037114143371582, "learning_rate": 3.311458389745514e-07, "loss": 0.6921, "step": 6637 }, { "epoch": 2.7612510724592463, "grad_norm": 2.0590221881866455, "learning_rate": 3.300033858492113e-07, "loss": 0.8143, "step": 6638 }, { "epoch": 2.7616670566518473, "grad_norm": 23.913877487182617, "learning_rate": 3.2886287379855443e-07, "loss": 0.825, "step": 6639 }, { "epoch": 2.762083040844448, "grad_norm": 1.8725701570510864, "learning_rate": 3.2772430305151646e-07, "loss": 0.7493, "step": 6640 }, { "epoch": 2.7624990250370485, "grad_norm": 79.30085754394531, "learning_rate": 3.26587673836648e-07, "loss": 0.8325, "step": 6641 }, { "epoch": 2.7629150092296495, "grad_norm": 1.9447096586227417, "learning_rate": 3.254529863821021e-07, "loss": 0.7874, "step": 6642 }, { "epoch": 2.76333099342225, "grad_norm": 1.8490709066390991, "learning_rate": 3.2432024091565093e-07, "loss": 0.7303, "step": 6643 }, { "epoch": 2.7637469776148507, "grad_norm": 1.8434654474258423, "learning_rate": 3.231894376646694e-07, "loss": 0.7211, "step": 6644 }, { "epoch": 2.7641629618074512, "grad_norm": 2.009141206741333, "learning_rate": 3.220605768561469e-07, "loss": 0.8872, "step": 6645 }, { "epoch": 2.764578946000052, "grad_norm": 1.9110713005065918, "learning_rate": 3.2093365871668e-07, "loss": 0.8584, "step": 6646 }, { "epoch": 2.7649949301926524, "grad_norm": 2.029538154602051, "learning_rate": 3.1980868347248005e-07, "loss": 0.8034, "step": 6647 }, { "epoch": 2.7654109143852534, "grad_norm": 2.0344130992889404, "learning_rate": 3.186856513493619e-07, "loss": 0.8676, "step": 6648 }, { "epoch": 2.765826898577854, "grad_norm": 1.9358927011489868, "learning_rate": 3.1756456257275523e-07, "loss": 0.7658, "step": 6649 }, { "epoch": 2.7662428827704546, "grad_norm": 1.9871211051940918, "learning_rate": 3.1644541736769785e-07, "loss": 0.7966, "step": 6650 }, { "epoch": 2.7666588669630556, "grad_norm": 1.8928602933883667, "learning_rate": 3.153282159588367e-07, "loss": 0.7896, "step": 6651 }, { "epoch": 2.767074851155656, "grad_norm": 1.925321340560913, "learning_rate": 3.142129585704323e-07, "loss": 0.8362, "step": 6652 }, { "epoch": 2.767490835348257, "grad_norm": 2.43564772605896, "learning_rate": 3.1309964542634906e-07, "loss": 0.7765, "step": 6653 }, { "epoch": 2.7679068195408574, "grad_norm": 2.064241886138916, "learning_rate": 3.119882767500659e-07, "loss": 0.7179, "step": 6654 }, { "epoch": 2.768322803733458, "grad_norm": 1.7982605695724487, "learning_rate": 3.108788527646678e-07, "loss": 0.7006, "step": 6655 }, { "epoch": 2.7687387879260585, "grad_norm": 1.9763518571853638, "learning_rate": 3.0977137369285427e-07, "loss": 0.7859, "step": 6656 }, { "epoch": 2.7691547721186596, "grad_norm": 1.881667137145996, "learning_rate": 3.086658397569298e-07, "loss": 0.8786, "step": 6657 }, { "epoch": 2.76957075631126, "grad_norm": 1.8633394241333008, "learning_rate": 3.0756225117881013e-07, "loss": 0.7177, "step": 6658 }, { "epoch": 2.7699867405038607, "grad_norm": 1.8513721227645874, "learning_rate": 3.064606081800203e-07, "loss": 0.7299, "step": 6659 }, { "epoch": 2.7704027246964618, "grad_norm": 31.03015899658203, "learning_rate": 3.053609109816957e-07, "loss": 0.784, "step": 6660 }, { "epoch": 2.7708187088890623, "grad_norm": 1.903464436531067, "learning_rate": 3.0426315980457756e-07, "loss": 0.8523, "step": 6661 }, { "epoch": 2.771234693081663, "grad_norm": 2.1976232528686523, "learning_rate": 3.0316735486902395e-07, "loss": 0.9355, "step": 6662 }, { "epoch": 2.7716506772742635, "grad_norm": 6.915736198425293, "learning_rate": 3.020734963949945e-07, "loss": 0.8361, "step": 6663 }, { "epoch": 2.772066661466864, "grad_norm": 2.038557291030884, "learning_rate": 3.009815846020625e-07, "loss": 0.8341, "step": 6664 }, { "epoch": 2.7724826456594647, "grad_norm": 2.4988179206848145, "learning_rate": 2.9989161970940816e-07, "loss": 0.7912, "step": 6665 }, { "epoch": 2.7728986298520657, "grad_norm": 1.8608283996582031, "learning_rate": 2.9880360193582316e-07, "loss": 0.8127, "step": 6666 }, { "epoch": 2.7733146140446663, "grad_norm": 406.7161865234375, "learning_rate": 2.977175314997072e-07, "loss": 0.8001, "step": 6667 }, { "epoch": 2.773730598237267, "grad_norm": 1.8482811450958252, "learning_rate": 2.9663340861906695e-07, "loss": 0.7887, "step": 6668 }, { "epoch": 2.774146582429868, "grad_norm": 1.7579914331436157, "learning_rate": 2.9555123351152293e-07, "loss": 0.6252, "step": 6669 }, { "epoch": 2.7745625666224685, "grad_norm": 1.8717665672302246, "learning_rate": 2.9447100639429904e-07, "loss": 0.7264, "step": 6670 }, { "epoch": 2.774978550815069, "grad_norm": 1.9215325117111206, "learning_rate": 2.9339272748423407e-07, "loss": 0.7179, "step": 6671 }, { "epoch": 2.7753945350076696, "grad_norm": 1.792466163635254, "learning_rate": 2.923163969977716e-07, "loss": 0.7738, "step": 6672 }, { "epoch": 2.77581051920027, "grad_norm": 1.8371695280075073, "learning_rate": 2.912420151509654e-07, "loss": 0.6676, "step": 6673 }, { "epoch": 2.776226503392871, "grad_norm": 1.8455594778060913, "learning_rate": 2.901695821594763e-07, "loss": 0.7791, "step": 6674 }, { "epoch": 2.776642487585472, "grad_norm": 1.9159154891967773, "learning_rate": 2.8909909823857776e-07, "loss": 0.8583, "step": 6675 }, { "epoch": 2.7770584717780724, "grad_norm": 1.817833423614502, "learning_rate": 2.8803056360314886e-07, "loss": 0.8087, "step": 6676 }, { "epoch": 2.777474455970673, "grad_norm": 1.8581374883651733, "learning_rate": 2.8696397846767807e-07, "loss": 0.7976, "step": 6677 }, { "epoch": 2.777890440163274, "grad_norm": 2.0492491722106934, "learning_rate": 2.858993430462653e-07, "loss": 0.881, "step": 6678 }, { "epoch": 2.7783064243558746, "grad_norm": 1.81434965133667, "learning_rate": 2.848366575526129e-07, "loss": 0.8143, "step": 6679 }, { "epoch": 2.778722408548475, "grad_norm": 1.6892757415771484, "learning_rate": 2.8377592220003694e-07, "loss": 0.6368, "step": 6680 }, { "epoch": 2.7791383927410758, "grad_norm": 1.8440824747085571, "learning_rate": 2.827171372014603e-07, "loss": 0.8699, "step": 6681 }, { "epoch": 2.7795543769336764, "grad_norm": 2.0442938804626465, "learning_rate": 2.816603027694165e-07, "loss": 0.8098, "step": 6682 }, { "epoch": 2.779970361126277, "grad_norm": 1.844142198562622, "learning_rate": 2.8060541911604344e-07, "loss": 0.7804, "step": 6683 }, { "epoch": 2.780386345318878, "grad_norm": 87.96157836914062, "learning_rate": 2.7955248645309185e-07, "loss": 0.73, "step": 6684 }, { "epoch": 2.7808023295114785, "grad_norm": 1.9650086164474487, "learning_rate": 2.78501504991916e-07, "loss": 1.0046, "step": 6685 }, { "epoch": 2.781218313704079, "grad_norm": 1.9152464866638184, "learning_rate": 2.774524749434848e-07, "loss": 0.8138, "step": 6686 }, { "epoch": 2.78163429789668, "grad_norm": 1.9104095697402954, "learning_rate": 2.7640539651836886e-07, "loss": 0.7194, "step": 6687 }, { "epoch": 2.7820502820892807, "grad_norm": 1.8050415515899658, "learning_rate": 2.753602699267499e-07, "loss": 0.7759, "step": 6688 }, { "epoch": 2.7824662662818813, "grad_norm": 1.8292604684829712, "learning_rate": 2.743170953784191e-07, "loss": 0.7189, "step": 6689 }, { "epoch": 2.782882250474482, "grad_norm": 1.928424596786499, "learning_rate": 2.732758730827745e-07, "loss": 0.7662, "step": 6690 }, { "epoch": 2.7832982346670825, "grad_norm": 63.40231704711914, "learning_rate": 2.722366032488222e-07, "loss": 0.765, "step": 6691 }, { "epoch": 2.783714218859683, "grad_norm": 1.8602410554885864, "learning_rate": 2.711992860851775e-07, "loss": 0.7474, "step": 6692 }, { "epoch": 2.784130203052284, "grad_norm": 1.6873213052749634, "learning_rate": 2.701639218000607e-07, "loss": 0.708, "step": 6693 }, { "epoch": 2.7845461872448847, "grad_norm": 14.946191787719727, "learning_rate": 2.691305106013042e-07, "loss": 0.8826, "step": 6694 }, { "epoch": 2.7849621714374853, "grad_norm": 1.8837469816207886, "learning_rate": 2.680990526963445e-07, "loss": 0.7623, "step": 6695 }, { "epoch": 2.7853781556300863, "grad_norm": 1.9185855388641357, "learning_rate": 2.670695482922303e-07, "loss": 0.7372, "step": 6696 }, { "epoch": 2.785794139822687, "grad_norm": 1.809888243675232, "learning_rate": 2.660419975956152e-07, "loss": 0.7426, "step": 6697 }, { "epoch": 2.7862101240152874, "grad_norm": 1.884757399559021, "learning_rate": 2.650164008127587e-07, "loss": 0.8101, "step": 6698 }, { "epoch": 2.786626108207888, "grad_norm": 1.9510070085525513, "learning_rate": 2.6399275814953384e-07, "loss": 0.9032, "step": 6699 }, { "epoch": 2.7870420924004886, "grad_norm": 5.0794358253479, "learning_rate": 2.629710698114152e-07, "loss": 0.8352, "step": 6700 }, { "epoch": 2.787458076593089, "grad_norm": 1.9250569343566895, "learning_rate": 2.6195133600349085e-07, "loss": 0.8018, "step": 6701 }, { "epoch": 2.7878740607856902, "grad_norm": 1.901564121246338, "learning_rate": 2.6093355693045053e-07, "loss": 0.9215, "step": 6702 }, { "epoch": 2.788290044978291, "grad_norm": 2.8249940872192383, "learning_rate": 2.599177327965985e-07, "loss": 0.7539, "step": 6703 }, { "epoch": 2.7887060291708914, "grad_norm": 1.7773611545562744, "learning_rate": 2.589038638058394e-07, "loss": 0.791, "step": 6704 }, { "epoch": 2.7891220133634924, "grad_norm": 2.070042371749878, "learning_rate": 2.5789195016169056e-07, "loss": 0.7745, "step": 6705 }, { "epoch": 2.789537997556093, "grad_norm": 1.945723295211792, "learning_rate": 2.5688199206727494e-07, "loss": 0.8181, "step": 6706 }, { "epoch": 2.7899539817486936, "grad_norm": 1.949903964996338, "learning_rate": 2.5587398972532373e-07, "loss": 0.8087, "step": 6707 }, { "epoch": 2.790369965941294, "grad_norm": 1.7319964170455933, "learning_rate": 2.548679433381729e-07, "loss": 0.6862, "step": 6708 }, { "epoch": 2.7907859501338947, "grad_norm": 1.9861962795257568, "learning_rate": 2.5386385310777083e-07, "loss": 0.8588, "step": 6709 }, { "epoch": 2.7912019343264953, "grad_norm": 1.9435147047042847, "learning_rate": 2.5286171923566637e-07, "loss": 0.8612, "step": 6710 }, { "epoch": 2.7916179185190964, "grad_norm": 6.6860246658325195, "learning_rate": 2.518615419230219e-07, "loss": 0.7791, "step": 6711 }, { "epoch": 2.792033902711697, "grad_norm": 1.8947694301605225, "learning_rate": 2.5086332137060687e-07, "loss": 0.7794, "step": 6712 }, { "epoch": 2.7924498869042975, "grad_norm": 2.015454053878784, "learning_rate": 2.4986705777879096e-07, "loss": 0.6917, "step": 6713 }, { "epoch": 2.7928658710968985, "grad_norm": 1.9731967449188232, "learning_rate": 2.4887275134755975e-07, "loss": 0.8601, "step": 6714 }, { "epoch": 2.793281855289499, "grad_norm": 1.839648723602295, "learning_rate": 2.4788040227650024e-07, "loss": 0.7433, "step": 6715 }, { "epoch": 2.7936978394820997, "grad_norm": 1.9933117628097534, "learning_rate": 2.468900107648087e-07, "loss": 0.8438, "step": 6716 }, { "epoch": 2.7941138236747003, "grad_norm": 1.946275234222412, "learning_rate": 2.4590157701128825e-07, "loss": 0.7749, "step": 6717 }, { "epoch": 2.794529807867301, "grad_norm": 4.858270168304443, "learning_rate": 2.44915101214348e-07, "loss": 0.8355, "step": 6718 }, { "epoch": 2.7949457920599015, "grad_norm": 1.8968079090118408, "learning_rate": 2.4393058357200626e-07, "loss": 0.7046, "step": 6719 }, { "epoch": 2.7953617762525025, "grad_norm": 2.135399341583252, "learning_rate": 2.42948024281886e-07, "loss": 0.8648, "step": 6720 }, { "epoch": 2.795777760445103, "grad_norm": 1.7786314487457275, "learning_rate": 2.4196742354121727e-07, "loss": 0.836, "step": 6721 }, { "epoch": 2.7961937446377036, "grad_norm": 1.9556100368499756, "learning_rate": 2.4098878154684036e-07, "loss": 0.7256, "step": 6722 }, { "epoch": 2.7966097288303047, "grad_norm": 1.9221725463867188, "learning_rate": 2.400120984951959e-07, "loss": 0.873, "step": 6723 }, { "epoch": 2.7970257130229053, "grad_norm": 2.0491695404052734, "learning_rate": 2.3903737458233934e-07, "loss": 0.7947, "step": 6724 }, { "epoch": 2.797441697215506, "grad_norm": 1.9589470624923706, "learning_rate": 2.3806461000392523e-07, "loss": 0.8126, "step": 6725 }, { "epoch": 2.7978576814081064, "grad_norm": 1.88896644115448, "learning_rate": 2.3709380495522073e-07, "loss": 0.9261, "step": 6726 }, { "epoch": 2.798273665600707, "grad_norm": 2.03401780128479, "learning_rate": 2.361249596310955e-07, "loss": 0.8555, "step": 6727 }, { "epoch": 2.7986896497933076, "grad_norm": 1.7020947933197021, "learning_rate": 2.3515807422602732e-07, "loss": 0.7141, "step": 6728 }, { "epoch": 2.7991056339859086, "grad_norm": 1.9383527040481567, "learning_rate": 2.3419314893410205e-07, "loss": 0.8049, "step": 6729 }, { "epoch": 2.799521618178509, "grad_norm": 1.847956657409668, "learning_rate": 2.332301839490092e-07, "loss": 0.7763, "step": 6730 }, { "epoch": 2.79993760237111, "grad_norm": 1.9092862606048584, "learning_rate": 2.3226917946404858e-07, "loss": 0.8349, "step": 6731 }, { "epoch": 2.800353586563711, "grad_norm": 1.8545244932174683, "learning_rate": 2.3131013567212145e-07, "loss": 0.6931, "step": 6732 }, { "epoch": 2.8007695707563114, "grad_norm": 1.944386601448059, "learning_rate": 2.303530527657405e-07, "loss": 0.764, "step": 6733 }, { "epoch": 2.801185554948912, "grad_norm": 1.84477961063385, "learning_rate": 2.29397930937022e-07, "loss": 0.689, "step": 6734 }, { "epoch": 2.8016015391415126, "grad_norm": 1.8328640460968018, "learning_rate": 2.284447703776893e-07, "loss": 0.7331, "step": 6735 }, { "epoch": 2.802017523334113, "grad_norm": 1.9095425605773926, "learning_rate": 2.2749357127907157e-07, "loss": 0.8827, "step": 6736 }, { "epoch": 2.8024335075267137, "grad_norm": 4.451007843017578, "learning_rate": 2.2654433383210495e-07, "loss": 0.7594, "step": 6737 }, { "epoch": 2.8028494917193147, "grad_norm": 1.9768973588943481, "learning_rate": 2.2559705822732925e-07, "loss": 0.8287, "step": 6738 }, { "epoch": 2.8032654759119153, "grad_norm": 1.8190131187438965, "learning_rate": 2.246517446548968e-07, "loss": 0.7753, "step": 6739 }, { "epoch": 2.803681460104516, "grad_norm": 1.7221065759658813, "learning_rate": 2.2370839330455806e-07, "loss": 0.6999, "step": 6740 }, { "epoch": 2.804097444297117, "grad_norm": 2.0826363563537598, "learning_rate": 2.22767004365676e-07, "loss": 0.8351, "step": 6741 }, { "epoch": 2.8045134284897175, "grad_norm": 1.867059588432312, "learning_rate": 2.218275780272172e-07, "loss": 0.8548, "step": 6742 }, { "epoch": 2.804929412682318, "grad_norm": 2.7166402339935303, "learning_rate": 2.20890114477752e-07, "loss": 0.8013, "step": 6743 }, { "epoch": 2.8053453968749187, "grad_norm": 1.8988498449325562, "learning_rate": 2.1995461390546203e-07, "loss": 0.744, "step": 6744 }, { "epoch": 2.8057613810675193, "grad_norm": 1.8706917762756348, "learning_rate": 2.190210764981304e-07, "loss": 0.7726, "step": 6745 }, { "epoch": 2.80617736526012, "grad_norm": 1.9690955877304077, "learning_rate": 2.180895024431473e-07, "loss": 0.7317, "step": 6746 }, { "epoch": 2.806593349452721, "grad_norm": 1.8408349752426147, "learning_rate": 2.1715989192750975e-07, "loss": 0.7334, "step": 6747 }, { "epoch": 2.8070093336453215, "grad_norm": 1.9080181121826172, "learning_rate": 2.162322451378196e-07, "loss": 0.7051, "step": 6748 }, { "epoch": 2.807425317837922, "grad_norm": 2.000444173812866, "learning_rate": 2.1530656226028458e-07, "loss": 0.7772, "step": 6749 }, { "epoch": 2.807841302030523, "grad_norm": 2.081270217895508, "learning_rate": 2.1438284348072048e-07, "loss": 0.8699, "step": 6750 }, { "epoch": 2.8082572862231236, "grad_norm": 1.7683948278427124, "learning_rate": 2.1346108898454455e-07, "loss": 0.8848, "step": 6751 }, { "epoch": 2.8086732704157242, "grad_norm": 1.969684362411499, "learning_rate": 2.1254129895678432e-07, "loss": 0.7912, "step": 6752 }, { "epoch": 2.809089254608325, "grad_norm": 1.9271893501281738, "learning_rate": 2.116234735820688e-07, "loss": 0.9288, "step": 6753 }, { "epoch": 2.8095052388009254, "grad_norm": 2.05057430267334, "learning_rate": 2.1070761304463616e-07, "loss": 0.899, "step": 6754 }, { "epoch": 2.809921222993526, "grad_norm": 1.818846344947815, "learning_rate": 2.0979371752832712e-07, "loss": 0.6875, "step": 6755 }, { "epoch": 2.810337207186127, "grad_norm": 1.9877716302871704, "learning_rate": 2.088817872165916e-07, "loss": 0.8399, "step": 6756 }, { "epoch": 2.8107531913787276, "grad_norm": 1.9206337928771973, "learning_rate": 2.0797182229248092e-07, "loss": 0.7866, "step": 6757 }, { "epoch": 2.811169175571328, "grad_norm": 1.859308123588562, "learning_rate": 2.0706382293865345e-07, "loss": 0.7665, "step": 6758 }, { "epoch": 2.811585159763929, "grad_norm": 1.9315234422683716, "learning_rate": 2.0615778933737562e-07, "loss": 0.8691, "step": 6759 }, { "epoch": 2.81200114395653, "grad_norm": 1.8656507730484009, "learning_rate": 2.0525372167051638e-07, "loss": 0.731, "step": 6760 }, { "epoch": 2.8124171281491304, "grad_norm": 1.7998526096343994, "learning_rate": 2.0435162011954946e-07, "loss": 0.6448, "step": 6761 }, { "epoch": 2.812833112341731, "grad_norm": 1.9872761964797974, "learning_rate": 2.0345148486555665e-07, "loss": 0.7763, "step": 6762 }, { "epoch": 2.8132490965343315, "grad_norm": 1.8667539358139038, "learning_rate": 2.0255331608922347e-07, "loss": 0.8125, "step": 6763 }, { "epoch": 2.813665080726932, "grad_norm": 1.8112905025482178, "learning_rate": 2.01657113970839e-07, "loss": 0.725, "step": 6764 }, { "epoch": 2.814081064919533, "grad_norm": 2.134489059448242, "learning_rate": 2.0076287869030265e-07, "loss": 0.858, "step": 6765 }, { "epoch": 2.8144970491121337, "grad_norm": 2.1291401386260986, "learning_rate": 1.998706104271131e-07, "loss": 0.8116, "step": 6766 }, { "epoch": 2.8149130333047343, "grad_norm": 2.0309016704559326, "learning_rate": 1.9898030936037814e-07, "loss": 0.7832, "step": 6767 }, { "epoch": 2.8153290174973353, "grad_norm": 1.8838427066802979, "learning_rate": 1.9809197566880823e-07, "loss": 0.7133, "step": 6768 }, { "epoch": 2.815745001689936, "grad_norm": 1.8798556327819824, "learning_rate": 1.9720560953072288e-07, "loss": 0.6678, "step": 6769 }, { "epoch": 2.8161609858825365, "grad_norm": 1.8557507991790771, "learning_rate": 1.963212111240398e-07, "loss": 0.7927, "step": 6770 }, { "epoch": 2.816576970075137, "grad_norm": 1.9982128143310547, "learning_rate": 1.9543878062628695e-07, "loss": 0.8826, "step": 6771 }, { "epoch": 2.8169929542677377, "grad_norm": 1.8676708936691284, "learning_rate": 1.9455831821459937e-07, "loss": 0.7649, "step": 6772 }, { "epoch": 2.8174089384603382, "grad_norm": 1.9236382246017456, "learning_rate": 1.9367982406571006e-07, "loss": 0.8118, "step": 6773 }, { "epoch": 2.8178249226529393, "grad_norm": 2.0596282482147217, "learning_rate": 1.9280329835596245e-07, "loss": 0.7832, "step": 6774 }, { "epoch": 2.81824090684554, "grad_norm": 1.8843599557876587, "learning_rate": 1.9192874126130245e-07, "loss": 0.7824, "step": 6775 }, { "epoch": 2.8186568910381404, "grad_norm": 1.9317604303359985, "learning_rate": 1.9105615295728186e-07, "loss": 0.7618, "step": 6776 }, { "epoch": 2.8190728752307415, "grad_norm": 2.054607391357422, "learning_rate": 1.9018553361905502e-07, "loss": 0.8767, "step": 6777 }, { "epoch": 2.819488859423342, "grad_norm": 4.642929553985596, "learning_rate": 1.8931688342138544e-07, "loss": 0.7942, "step": 6778 }, { "epoch": 2.8199048436159426, "grad_norm": 46.31842041015625, "learning_rate": 1.884502025386359e-07, "loss": 0.7905, "step": 6779 }, { "epoch": 2.820320827808543, "grad_norm": 2.092374324798584, "learning_rate": 1.8758549114477943e-07, "loss": 0.7949, "step": 6780 }, { "epoch": 2.820736812001144, "grad_norm": 1.8250850439071655, "learning_rate": 1.8672274941338941e-07, "loss": 0.7621, "step": 6781 }, { "epoch": 2.8211527961937444, "grad_norm": 2.0331203937530518, "learning_rate": 1.8586197751764624e-07, "loss": 0.8215, "step": 6782 }, { "epoch": 2.8215687803863454, "grad_norm": 1.761401891708374, "learning_rate": 1.8500317563033275e-07, "loss": 0.7956, "step": 6783 }, { "epoch": 2.821984764578946, "grad_norm": 2.1459617614746094, "learning_rate": 1.8414634392384e-07, "loss": 0.804, "step": 6784 }, { "epoch": 2.8224007487715466, "grad_norm": 1.6130247116088867, "learning_rate": 1.8329148257015926e-07, "loss": 0.5831, "step": 6785 }, { "epoch": 2.8228167329641476, "grad_norm": 1.9833133220672607, "learning_rate": 1.8243859174088996e-07, "loss": 0.8086, "step": 6786 }, { "epoch": 2.823232717156748, "grad_norm": 1.9135156869888306, "learning_rate": 1.8158767160723177e-07, "loss": 0.8218, "step": 6787 }, { "epoch": 2.8236487013493488, "grad_norm": 2.0241565704345703, "learning_rate": 1.8073872233999258e-07, "loss": 0.8551, "step": 6788 }, { "epoch": 2.8240646855419493, "grad_norm": 1.9564666748046875, "learning_rate": 1.7989174410958487e-07, "loss": 0.9374, "step": 6789 }, { "epoch": 2.82448066973455, "grad_norm": 1.6696473360061646, "learning_rate": 1.790467370860216e-07, "loss": 0.7985, "step": 6790 }, { "epoch": 2.8248966539271505, "grad_norm": 1.8525938987731934, "learning_rate": 1.7820370143892485e-07, "loss": 0.7116, "step": 6791 }, { "epoch": 2.8253126381197515, "grad_norm": 1.8861860036849976, "learning_rate": 1.77362637337517e-07, "loss": 0.7568, "step": 6792 }, { "epoch": 2.825728622312352, "grad_norm": 1.8358997106552124, "learning_rate": 1.7652354495062641e-07, "loss": 0.8153, "step": 6793 }, { "epoch": 2.8261446065049527, "grad_norm": 1.8226648569107056, "learning_rate": 1.7568642444668714e-07, "loss": 0.7845, "step": 6794 }, { "epoch": 2.8265605906975537, "grad_norm": 1.9535059928894043, "learning_rate": 1.7485127599373265e-07, "loss": 0.7965, "step": 6795 }, { "epoch": 2.8269765748901543, "grad_norm": 2.093294620513916, "learning_rate": 1.7401809975940652e-07, "loss": 0.8879, "step": 6796 }, { "epoch": 2.827392559082755, "grad_norm": 1.9942244291305542, "learning_rate": 1.731868959109517e-07, "loss": 0.8555, "step": 6797 }, { "epoch": 2.8278085432753555, "grad_norm": 1.9834060668945312, "learning_rate": 1.7235766461521807e-07, "loss": 0.7664, "step": 6798 }, { "epoch": 2.828224527467956, "grad_norm": 1.7804607152938843, "learning_rate": 1.715304060386591e-07, "loss": 0.694, "step": 6799 }, { "epoch": 2.8286405116605566, "grad_norm": 1.8729828596115112, "learning_rate": 1.707051203473309e-07, "loss": 0.7135, "step": 6800 }, { "epoch": 2.8290564958531577, "grad_norm": 1.8252861499786377, "learning_rate": 1.6988180770689423e-07, "loss": 0.8046, "step": 6801 }, { "epoch": 2.8294724800457582, "grad_norm": 1.9947272539138794, "learning_rate": 1.690604682826147e-07, "loss": 0.8445, "step": 6802 }, { "epoch": 2.829888464238359, "grad_norm": 1.8867837190628052, "learning_rate": 1.682411022393615e-07, "loss": 0.9044, "step": 6803 }, { "epoch": 2.83030444843096, "grad_norm": 2.0052289962768555, "learning_rate": 1.674237097416076e-07, "loss": 0.797, "step": 6804 }, { "epoch": 2.8307204326235604, "grad_norm": 2.009321451187134, "learning_rate": 1.6660829095342833e-07, "loss": 0.8375, "step": 6805 }, { "epoch": 2.831136416816161, "grad_norm": 1.8982104063034058, "learning_rate": 1.6579484603850393e-07, "loss": 0.6402, "step": 6806 }, { "epoch": 2.8315524010087616, "grad_norm": 1.8995088338851929, "learning_rate": 1.649833751601193e-07, "loss": 0.7306, "step": 6807 }, { "epoch": 2.831968385201362, "grad_norm": 1.8035622835159302, "learning_rate": 1.6417387848116305e-07, "loss": 0.7843, "step": 6808 }, { "epoch": 2.8323843693939628, "grad_norm": 1.7887476682662964, "learning_rate": 1.633663561641252e-07, "loss": 0.7514, "step": 6809 }, { "epoch": 2.832800353586564, "grad_norm": 128.93792724609375, "learning_rate": 1.6256080837110278e-07, "loss": 0.8563, "step": 6810 }, { "epoch": 2.8332163377791644, "grad_norm": 1.852123498916626, "learning_rate": 1.6175723526379305e-07, "loss": 0.7734, "step": 6811 }, { "epoch": 2.833632321971765, "grad_norm": 2.0170738697052, "learning_rate": 1.6095563700350037e-07, "loss": 0.8655, "step": 6812 }, { "epoch": 2.834048306164366, "grad_norm": 1.993208646774292, "learning_rate": 1.6015601375112821e-07, "loss": 0.7344, "step": 6813 }, { "epoch": 2.8344642903569666, "grad_norm": 1.7924449443817139, "learning_rate": 1.5935836566718931e-07, "loss": 0.7361, "step": 6814 }, { "epoch": 2.834880274549567, "grad_norm": 1.8699852228164673, "learning_rate": 1.585626929117967e-07, "loss": 0.8086, "step": 6815 }, { "epoch": 2.8352962587421677, "grad_norm": 1.8846288919448853, "learning_rate": 1.5776899564466486e-07, "loss": 0.7558, "step": 6816 }, { "epoch": 2.8357122429347683, "grad_norm": 1.8677363395690918, "learning_rate": 1.5697727402511408e-07, "loss": 0.7921, "step": 6817 }, { "epoch": 2.836128227127369, "grad_norm": 1.9072697162628174, "learning_rate": 1.561875282120706e-07, "loss": 0.8646, "step": 6818 }, { "epoch": 2.83654421131997, "grad_norm": 1.879290223121643, "learning_rate": 1.5539975836405984e-07, "loss": 0.7707, "step": 6819 }, { "epoch": 2.8369601955125705, "grad_norm": 2.141340970993042, "learning_rate": 1.5461396463921086e-07, "loss": 0.942, "step": 6820 }, { "epoch": 2.837376179705171, "grad_norm": 109.22256469726562, "learning_rate": 1.5383014719525968e-07, "loss": 0.8128, "step": 6821 }, { "epoch": 2.837792163897772, "grad_norm": 2.0253384113311768, "learning_rate": 1.530483061895427e-07, "loss": 0.825, "step": 6822 }, { "epoch": 2.8382081480903727, "grad_norm": 1.7585399150848389, "learning_rate": 1.522684417789988e-07, "loss": 0.7712, "step": 6823 }, { "epoch": 2.8386241322829733, "grad_norm": 1.9443343877792358, "learning_rate": 1.5149055412017387e-07, "loss": 0.8141, "step": 6824 }, { "epoch": 2.839040116475574, "grad_norm": 2.002716541290283, "learning_rate": 1.5071464336921304e-07, "loss": 0.8241, "step": 6825 }, { "epoch": 2.8394561006681744, "grad_norm": 1.8407341241836548, "learning_rate": 1.4994070968186503e-07, "loss": 0.782, "step": 6826 }, { "epoch": 2.839872084860775, "grad_norm": 209.9096221923828, "learning_rate": 1.4916875321348556e-07, "loss": 0.6916, "step": 6827 }, { "epoch": 2.840288069053376, "grad_norm": 2.0885567665100098, "learning_rate": 1.4839877411902738e-07, "loss": 0.836, "step": 6828 }, { "epoch": 2.8407040532459766, "grad_norm": 1.8191211223602295, "learning_rate": 1.4763077255305346e-07, "loss": 0.7487, "step": 6829 }, { "epoch": 2.841120037438577, "grad_norm": 1.9587513208389282, "learning_rate": 1.4686474866972388e-07, "loss": 0.9406, "step": 6830 }, { "epoch": 2.8415360216311782, "grad_norm": 236.12290954589844, "learning_rate": 1.461007026228045e-07, "loss": 0.8564, "step": 6831 }, { "epoch": 2.841952005823779, "grad_norm": 101.62615966796875, "learning_rate": 1.4533863456566266e-07, "loss": 0.8485, "step": 6832 }, { "epoch": 2.8423679900163794, "grad_norm": 2.0183868408203125, "learning_rate": 1.4457854465127043e-07, "loss": 0.6587, "step": 6833 }, { "epoch": 2.84278397420898, "grad_norm": 1.7790472507476807, "learning_rate": 1.438204330322024e-07, "loss": 0.6759, "step": 6834 }, { "epoch": 2.8431999584015806, "grad_norm": 1.8699138164520264, "learning_rate": 1.4306429986063353e-07, "loss": 0.6932, "step": 6835 }, { "epoch": 2.843615942594181, "grad_norm": 1.8688488006591797, "learning_rate": 1.4231014528834686e-07, "loss": 0.6654, "step": 6836 }, { "epoch": 2.844031926786782, "grad_norm": 1.9223437309265137, "learning_rate": 1.415579694667224e-07, "loss": 0.7536, "step": 6837 }, { "epoch": 2.8444479109793828, "grad_norm": 24.26193618774414, "learning_rate": 1.4080777254674716e-07, "loss": 0.7446, "step": 6838 }, { "epoch": 2.8448638951719833, "grad_norm": 2.001272439956665, "learning_rate": 1.4005955467900733e-07, "loss": 0.7535, "step": 6839 }, { "epoch": 2.8452798793645844, "grad_norm": 2.0268120765686035, "learning_rate": 1.3931331601369724e-07, "loss": 0.8779, "step": 6840 }, { "epoch": 2.845695863557185, "grad_norm": 1.758525013923645, "learning_rate": 1.3856905670060815e-07, "loss": 0.841, "step": 6841 }, { "epoch": 2.8461118477497855, "grad_norm": 1.8766980171203613, "learning_rate": 1.3782677688913836e-07, "loss": 0.807, "step": 6842 }, { "epoch": 2.846527831942386, "grad_norm": 1.9250457286834717, "learning_rate": 1.3708647672828424e-07, "loss": 0.8831, "step": 6843 }, { "epoch": 2.8469438161349867, "grad_norm": 1.7817540168762207, "learning_rate": 1.363481563666502e-07, "loss": 0.7641, "step": 6844 }, { "epoch": 2.8473598003275873, "grad_norm": 1.8934847116470337, "learning_rate": 1.356118159524389e-07, "loss": 0.7707, "step": 6845 }, { "epoch": 2.8477757845201883, "grad_norm": 2.089667320251465, "learning_rate": 1.3487745563345865e-07, "loss": 0.8225, "step": 6846 }, { "epoch": 2.848191768712789, "grad_norm": 2.033811569213867, "learning_rate": 1.341450755571161e-07, "loss": 0.8583, "step": 6847 }, { "epoch": 2.8486077529053895, "grad_norm": 1.9573839902877808, "learning_rate": 1.334146758704258e-07, "loss": 0.7204, "step": 6848 }, { "epoch": 2.8490237370979905, "grad_norm": 1.9171867370605469, "learning_rate": 1.3268625672000157e-07, "loss": 0.7688, "step": 6849 }, { "epoch": 2.849439721290591, "grad_norm": 1.9427129030227661, "learning_rate": 1.3195981825205871e-07, "loss": 0.7386, "step": 6850 }, { "epoch": 2.8498557054831917, "grad_norm": 1.8490641117095947, "learning_rate": 1.3123536061241838e-07, "loss": 0.6743, "step": 6851 }, { "epoch": 2.8502716896757923, "grad_norm": 1.9442631006240845, "learning_rate": 1.3051288394650084e-07, "loss": 0.775, "step": 6852 }, { "epoch": 2.850687673868393, "grad_norm": 2.087425708770752, "learning_rate": 1.297923883993324e-07, "loss": 0.9401, "step": 6853 }, { "epoch": 2.8511036580609934, "grad_norm": 2.0541834831237793, "learning_rate": 1.2907387411553617e-07, "loss": 0.7891, "step": 6854 }, { "epoch": 2.8515196422535944, "grad_norm": 1.9044170379638672, "learning_rate": 1.283573412393424e-07, "loss": 0.9492, "step": 6855 }, { "epoch": 2.851935626446195, "grad_norm": 1.9219415187835693, "learning_rate": 1.2764278991458047e-07, "loss": 0.7819, "step": 6856 }, { "epoch": 2.8523516106387956, "grad_norm": 34.6617431640625, "learning_rate": 1.269302202846867e-07, "loss": 0.8124, "step": 6857 }, { "epoch": 2.8527675948313966, "grad_norm": 2.014854907989502, "learning_rate": 1.2621963249269232e-07, "loss": 0.7989, "step": 6858 }, { "epoch": 2.853183579023997, "grad_norm": 1.8756780624389648, "learning_rate": 1.255110266812376e-07, "loss": 0.8228, "step": 6859 }, { "epoch": 2.853599563216598, "grad_norm": 1.8956469297409058, "learning_rate": 1.2480440299256103e-07, "loss": 0.753, "step": 6860 }, { "epoch": 2.8540155474091984, "grad_norm": 1.9182302951812744, "learning_rate": 1.240997615685058e-07, "loss": 0.861, "step": 6861 }, { "epoch": 2.854431531601799, "grad_norm": 1.8888517618179321, "learning_rate": 1.233971025505143e-07, "loss": 0.7691, "step": 6862 }, { "epoch": 2.8548475157943995, "grad_norm": 1.8712420463562012, "learning_rate": 1.2269642607963262e-07, "loss": 0.693, "step": 6863 }, { "epoch": 2.8552634999870006, "grad_norm": 1.7331515550613403, "learning_rate": 1.2199773229650935e-07, "loss": 0.7386, "step": 6864 }, { "epoch": 2.855679484179601, "grad_norm": 1.909829020500183, "learning_rate": 1.2130102134139343e-07, "loss": 0.8354, "step": 6865 }, { "epoch": 2.8560954683722017, "grad_norm": 1.8659961223602295, "learning_rate": 1.2060629335413743e-07, "loss": 0.7951, "step": 6866 }, { "epoch": 2.8565114525648028, "grad_norm": 2.0029637813568115, "learning_rate": 1.1991354847419533e-07, "loss": 0.802, "step": 6867 }, { "epoch": 2.8569274367574033, "grad_norm": 2.0231595039367676, "learning_rate": 1.192227868406237e-07, "loss": 0.8982, "step": 6868 }, { "epoch": 2.857343420950004, "grad_norm": 1.9096735715866089, "learning_rate": 1.185340085920783e-07, "loss": 0.8663, "step": 6869 }, { "epoch": 2.8577594051426045, "grad_norm": 1.950717806816101, "learning_rate": 1.1784721386682074e-07, "loss": 0.7427, "step": 6870 }, { "epoch": 2.858175389335205, "grad_norm": 1.8516849279403687, "learning_rate": 1.1716240280271185e-07, "loss": 0.7926, "step": 6871 }, { "epoch": 2.8585913735278057, "grad_norm": 1.9166045188903809, "learning_rate": 1.16479575537215e-07, "loss": 0.693, "step": 6872 }, { "epoch": 2.8590073577204067, "grad_norm": 1.9780750274658203, "learning_rate": 1.1579873220739501e-07, "loss": 0.6888, "step": 6873 }, { "epoch": 2.8594233419130073, "grad_norm": 1.860369324684143, "learning_rate": 1.1511987294991811e-07, "loss": 0.7553, "step": 6874 }, { "epoch": 2.859839326105608, "grad_norm": 2.059720039367676, "learning_rate": 1.1444299790105417e-07, "loss": 0.7826, "step": 6875 }, { "epoch": 2.860255310298209, "grad_norm": 1.811523675918579, "learning_rate": 1.1376810719667231e-07, "loss": 0.8543, "step": 6876 }, { "epoch": 2.8606712944908095, "grad_norm": 1.9201321601867676, "learning_rate": 1.1309520097224413e-07, "loss": 0.7766, "step": 6877 }, { "epoch": 2.86108727868341, "grad_norm": 80.6410140991211, "learning_rate": 1.1242427936284494e-07, "loss": 0.7343, "step": 6878 }, { "epoch": 2.8615032628760106, "grad_norm": 1.806671380996704, "learning_rate": 1.1175534250314924e-07, "loss": 0.8836, "step": 6879 }, { "epoch": 2.8619192470686112, "grad_norm": 1.874318242073059, "learning_rate": 1.1108839052743292e-07, "loss": 0.7902, "step": 6880 }, { "epoch": 2.862335231261212, "grad_norm": 1.932863712310791, "learning_rate": 1.1042342356957559e-07, "loss": 0.7643, "step": 6881 }, { "epoch": 2.862751215453813, "grad_norm": 1.9694740772247314, "learning_rate": 1.0976044176305712e-07, "loss": 0.7687, "step": 6882 }, { "epoch": 2.8631671996464134, "grad_norm": 1.9096887111663818, "learning_rate": 1.0909944524095884e-07, "loss": 0.8241, "step": 6883 }, { "epoch": 2.863583183839014, "grad_norm": 1.9013112783432007, "learning_rate": 1.0844043413596239e-07, "loss": 0.6931, "step": 6884 }, { "epoch": 2.863999168031615, "grad_norm": 1.9481310844421387, "learning_rate": 1.0778340858035418e-07, "loss": 0.7893, "step": 6885 }, { "epoch": 2.8644151522242156, "grad_norm": 1.9538249969482422, "learning_rate": 1.0712836870601762e-07, "loss": 0.7787, "step": 6886 }, { "epoch": 2.864831136416816, "grad_norm": 1.9353452920913696, "learning_rate": 1.0647531464444305e-07, "loss": 0.8575, "step": 6887 }, { "epoch": 2.8652471206094168, "grad_norm": 1.9948365688323975, "learning_rate": 1.0582424652671563e-07, "loss": 0.8222, "step": 6888 }, { "epoch": 2.8656631048020174, "grad_norm": 1.899876356124878, "learning_rate": 1.051751644835286e-07, "loss": 0.9317, "step": 6889 }, { "epoch": 2.866079088994618, "grad_norm": 1.9374159574508667, "learning_rate": 1.0452806864517217e-07, "loss": 0.845, "step": 6890 }, { "epoch": 2.866495073187219, "grad_norm": 2.03542423248291, "learning_rate": 1.03882959141538e-07, "loss": 0.8007, "step": 6891 }, { "epoch": 2.8669110573798195, "grad_norm": 229.34725952148438, "learning_rate": 1.032398361021203e-07, "loss": 0.7772, "step": 6892 }, { "epoch": 2.86732704157242, "grad_norm": 1.9272576570510864, "learning_rate": 1.0259869965601577e-07, "loss": 0.7992, "step": 6893 }, { "epoch": 2.867743025765021, "grad_norm": 1.8357555866241455, "learning_rate": 1.0195954993191925e-07, "loss": 0.7308, "step": 6894 }, { "epoch": 2.8681590099576217, "grad_norm": 2.0152978897094727, "learning_rate": 1.0132238705812814e-07, "loss": 0.9206, "step": 6895 }, { "epoch": 2.8685749941502223, "grad_norm": 1.9724583625793457, "learning_rate": 1.0068721116254232e-07, "loss": 0.8375, "step": 6896 }, { "epoch": 2.868990978342823, "grad_norm": 1.9412329196929932, "learning_rate": 1.0005402237266092e-07, "loss": 0.7932, "step": 6897 }, { "epoch": 2.8694069625354235, "grad_norm": 4.940816402435303, "learning_rate": 9.942282081558563e-08, "loss": 0.8124, "step": 6898 }, { "epoch": 2.869822946728024, "grad_norm": 1.906668782234192, "learning_rate": 9.87936066180184e-08, "loss": 0.7722, "step": 6899 }, { "epoch": 2.870238930920625, "grad_norm": 1.8489049673080444, "learning_rate": 9.816637990626266e-08, "loss": 0.7272, "step": 6900 }, { "epoch": 2.8706549151132257, "grad_norm": 1.8744843006134033, "learning_rate": 9.754114080622101e-08, "loss": 0.7849, "step": 6901 }, { "epoch": 2.8710708993058263, "grad_norm": 2.295531749725342, "learning_rate": 9.691788944340196e-08, "loss": 0.8331, "step": 6902 }, { "epoch": 2.8714868834984273, "grad_norm": 1.8655146360397339, "learning_rate": 9.629662594290879e-08, "loss": 0.6903, "step": 6903 }, { "epoch": 2.871902867691028, "grad_norm": 1.8426597118377686, "learning_rate": 9.567735042945059e-08, "loss": 0.8248, "step": 6904 }, { "epoch": 2.8723188518836285, "grad_norm": 1.9332239627838135, "learning_rate": 9.506006302733461e-08, "loss": 0.6839, "step": 6905 }, { "epoch": 2.872734836076229, "grad_norm": 1.7969180345535278, "learning_rate": 9.444476386046952e-08, "loss": 0.7607, "step": 6906 }, { "epoch": 2.8731508202688296, "grad_norm": 1.9785723686218262, "learning_rate": 9.383145305236652e-08, "loss": 0.8821, "step": 6907 }, { "epoch": 2.87356680446143, "grad_norm": 1.8822060823440552, "learning_rate": 9.322013072613712e-08, "loss": 0.8003, "step": 6908 }, { "epoch": 2.8739827886540312, "grad_norm": 2.0231103897094727, "learning_rate": 9.261079700449094e-08, "loss": 0.784, "step": 6909 }, { "epoch": 2.874398772846632, "grad_norm": 1.8781678676605225, "learning_rate": 9.200345200974125e-08, "loss": 0.8638, "step": 6910 }, { "epoch": 2.8748147570392324, "grad_norm": 1.9875956773757935, "learning_rate": 9.139809586380277e-08, "loss": 0.7623, "step": 6911 }, { "epoch": 2.8752307412318334, "grad_norm": 1.9359033107757568, "learning_rate": 9.07947286881894e-08, "loss": 0.6549, "step": 6912 }, { "epoch": 2.875646725424434, "grad_norm": 1.9569231271743774, "learning_rate": 9.019335060401535e-08, "loss": 0.9065, "step": 6913 }, { "epoch": 2.8760627096170346, "grad_norm": 1.7673025131225586, "learning_rate": 8.95939617319952e-08, "loss": 0.6686, "step": 6914 }, { "epoch": 2.876478693809635, "grad_norm": 2.0543465614318848, "learning_rate": 8.899656219244713e-08, "loss": 0.824, "step": 6915 }, { "epoch": 2.8768946780022358, "grad_norm": 1.909401535987854, "learning_rate": 8.840115210528632e-08, "loss": 0.9478, "step": 6916 }, { "epoch": 2.8773106621948363, "grad_norm": 1.8907819986343384, "learning_rate": 8.780773159003275e-08, "loss": 0.8392, "step": 6917 }, { "epoch": 2.8777266463874374, "grad_norm": 1.8653790950775146, "learning_rate": 8.721630076580334e-08, "loss": 0.7817, "step": 6918 }, { "epoch": 2.878142630580038, "grad_norm": 1.7168102264404297, "learning_rate": 8.662685975131757e-08, "loss": 0.7153, "step": 6919 }, { "epoch": 2.8785586147726385, "grad_norm": 1.896498680114746, "learning_rate": 8.603940866489302e-08, "loss": 0.77, "step": 6920 }, { "epoch": 2.8789745989652396, "grad_norm": 1.8782238960266113, "learning_rate": 8.545394762445314e-08, "loss": 0.764, "step": 6921 }, { "epoch": 2.87939058315784, "grad_norm": 1.9242100715637207, "learning_rate": 8.487047674751503e-08, "loss": 0.8156, "step": 6922 }, { "epoch": 2.8798065673504407, "grad_norm": 1.7959057092666626, "learning_rate": 8.428899615120167e-08, "loss": 0.7492, "step": 6923 }, { "epoch": 2.8802225515430413, "grad_norm": 1.8273475170135498, "learning_rate": 8.370950595223526e-08, "loss": 0.7517, "step": 6924 }, { "epoch": 2.880638535735642, "grad_norm": 1.9267003536224365, "learning_rate": 8.313200626693607e-08, "loss": 0.6661, "step": 6925 }, { "epoch": 2.8810545199282425, "grad_norm": 303.20648193359375, "learning_rate": 8.255649721122805e-08, "loss": 0.8108, "step": 6926 }, { "epoch": 2.8814705041208435, "grad_norm": 1.9761170148849487, "learning_rate": 8.198297890063323e-08, "loss": 0.8325, "step": 6927 }, { "epoch": 2.881886488313444, "grad_norm": 7.052860260009766, "learning_rate": 8.141145145027507e-08, "loss": 0.7856, "step": 6928 }, { "epoch": 2.8823024725060447, "grad_norm": 1.9354592561721802, "learning_rate": 8.084191497487737e-08, "loss": 0.7566, "step": 6929 }, { "epoch": 2.8827184566986457, "grad_norm": 1.9049640893936157, "learning_rate": 8.027436958876422e-08, "loss": 0.7541, "step": 6930 }, { "epoch": 2.8831344408912463, "grad_norm": 1.841255784034729, "learning_rate": 7.970881540585895e-08, "loss": 0.7419, "step": 6931 }, { "epoch": 2.883550425083847, "grad_norm": 1.9596120119094849, "learning_rate": 7.914525253968741e-08, "loss": 0.7339, "step": 6932 }, { "epoch": 2.8839664092764474, "grad_norm": 1.7946420907974243, "learning_rate": 7.858368110337577e-08, "loss": 0.8421, "step": 6933 }, { "epoch": 2.884382393469048, "grad_norm": 2.23848557472229, "learning_rate": 7.80241012096461e-08, "loss": 0.8182, "step": 6934 }, { "epoch": 2.8847983776616486, "grad_norm": 1.9192153215408325, "learning_rate": 7.746651297082519e-08, "loss": 0.8009, "step": 6935 }, { "epoch": 2.8852143618542496, "grad_norm": 1.9653041362762451, "learning_rate": 7.69109164988402e-08, "loss": 0.7716, "step": 6936 }, { "epoch": 2.88563034604685, "grad_norm": 2.008636713027954, "learning_rate": 7.635731190521412e-08, "loss": 0.7474, "step": 6937 }, { "epoch": 2.886046330239451, "grad_norm": 1.7711459398269653, "learning_rate": 7.580569930107473e-08, "loss": 0.7869, "step": 6938 }, { "epoch": 2.886462314432052, "grad_norm": 2.066328763961792, "learning_rate": 7.525607879714791e-08, "loss": 0.9035, "step": 6939 }, { "epoch": 2.8868782986246524, "grad_norm": 1.994760274887085, "learning_rate": 7.470845050375985e-08, "loss": 0.78, "step": 6940 }, { "epoch": 2.887294282817253, "grad_norm": 1.8669267892837524, "learning_rate": 7.416281453083707e-08, "loss": 0.7955, "step": 6941 }, { "epoch": 2.8877102670098536, "grad_norm": 1.9314709901809692, "learning_rate": 7.361917098790639e-08, "loss": 0.8339, "step": 6942 }, { "epoch": 2.888126251202454, "grad_norm": 1.9235764741897583, "learning_rate": 7.307751998409274e-08, "loss": 0.8148, "step": 6943 }, { "epoch": 2.8885422353950547, "grad_norm": 1.998321294784546, "learning_rate": 7.253786162812359e-08, "loss": 0.7883, "step": 6944 }, { "epoch": 2.8889582195876558, "grad_norm": 2.982132911682129, "learning_rate": 7.200019602832565e-08, "loss": 0.759, "step": 6945 }, { "epoch": 2.8893742037802563, "grad_norm": 2.1232035160064697, "learning_rate": 7.146452329262477e-08, "loss": 0.7754, "step": 6946 }, { "epoch": 2.889790187972857, "grad_norm": 2.2861413955688477, "learning_rate": 7.09308435285494e-08, "loss": 0.7546, "step": 6947 }, { "epoch": 2.890206172165458, "grad_norm": 1.799254298210144, "learning_rate": 7.039915684322273e-08, "loss": 0.7204, "step": 6948 }, { "epoch": 2.8906221563580585, "grad_norm": 1.7649074792861938, "learning_rate": 6.986946334337386e-08, "loss": 0.7888, "step": 6949 }, { "epoch": 2.891038140550659, "grad_norm": 2.0212817192077637, "learning_rate": 6.93417631353277e-08, "loss": 0.8589, "step": 6950 }, { "epoch": 2.8914541247432597, "grad_norm": 2.0413007736206055, "learning_rate": 6.881605632501065e-08, "loss": 0.8697, "step": 6951 }, { "epoch": 2.8918701089358603, "grad_norm": 1.948422908782959, "learning_rate": 6.829234301794941e-08, "loss": 0.8145, "step": 6952 }, { "epoch": 2.892286093128461, "grad_norm": 1.8770233392715454, "learning_rate": 6.777062331926875e-08, "loss": 0.7707, "step": 6953 }, { "epoch": 2.892702077321062, "grad_norm": 1.9338862895965576, "learning_rate": 6.725089733369605e-08, "loss": 0.7227, "step": 6954 }, { "epoch": 2.8931180615136625, "grad_norm": 2.0246918201446533, "learning_rate": 6.67331651655545e-08, "loss": 0.6758, "step": 6955 }, { "epoch": 2.893534045706263, "grad_norm": 1.8565047979354858, "learning_rate": 6.621742691877097e-08, "loss": 0.7911, "step": 6956 }, { "epoch": 2.893950029898864, "grad_norm": 1.904039740562439, "learning_rate": 6.570368269686933e-08, "loss": 0.7687, "step": 6957 }, { "epoch": 2.8943660140914647, "grad_norm": 151.79891967773438, "learning_rate": 6.519193260297485e-08, "loss": 0.9413, "step": 6958 }, { "epoch": 2.8947819982840652, "grad_norm": 2.0646557807922363, "learning_rate": 6.468217673981204e-08, "loss": 0.7692, "step": 6959 }, { "epoch": 2.895197982476666, "grad_norm": 1.8697233200073242, "learning_rate": 6.417441520970569e-08, "loss": 0.7413, "step": 6960 }, { "epoch": 2.8956139666692664, "grad_norm": 39.08182144165039, "learning_rate": 6.366864811457874e-08, "loss": 0.7524, "step": 6961 }, { "epoch": 2.896029950861867, "grad_norm": 1.947885274887085, "learning_rate": 6.316487555595552e-08, "loss": 0.8495, "step": 6962 }, { "epoch": 2.896445935054468, "grad_norm": 52.66129684448242, "learning_rate": 6.266309763495848e-08, "loss": 0.8177, "step": 6963 }, { "epoch": 2.8968619192470686, "grad_norm": 1.9687799215316772, "learning_rate": 6.216331445231039e-08, "loss": 0.7712, "step": 6964 }, { "epoch": 2.897277903439669, "grad_norm": 2.0322930812835693, "learning_rate": 6.16655261083332e-08, "loss": 0.7591, "step": 6965 }, { "epoch": 2.89769388763227, "grad_norm": 1.9566287994384766, "learning_rate": 6.116973270294923e-08, "loss": 0.7038, "step": 6966 }, { "epoch": 2.898109871824871, "grad_norm": 1.7783877849578857, "learning_rate": 6.067593433567997e-08, "loss": 0.787, "step": 6967 }, { "epoch": 2.8985258560174714, "grad_norm": 1.8920575380325317, "learning_rate": 6.018413110564613e-08, "loss": 0.7669, "step": 6968 }, { "epoch": 2.898941840210072, "grad_norm": 1.9534921646118164, "learning_rate": 5.969432311156876e-08, "loss": 0.7681, "step": 6969 }, { "epoch": 2.8993578244026725, "grad_norm": 1.9435057640075684, "learning_rate": 5.920651045176695e-08, "loss": 0.8167, "step": 6970 }, { "epoch": 2.899773808595273, "grad_norm": 1.8331193923950195, "learning_rate": 5.87206932241613e-08, "loss": 0.8474, "step": 6971 }, { "epoch": 2.900189792787874, "grad_norm": 1.7668644189834595, "learning_rate": 5.8236871526270447e-08, "loss": 0.825, "step": 6972 }, { "epoch": 2.9006057769804747, "grad_norm": 1.9070719480514526, "learning_rate": 5.775504545521227e-08, "loss": 0.8135, "step": 6973 }, { "epoch": 2.9010217611730753, "grad_norm": 1.9471296072006226, "learning_rate": 5.727521510770384e-08, "loss": 0.7477, "step": 6974 }, { "epoch": 2.9014377453656763, "grad_norm": 1.8226124048233032, "learning_rate": 5.679738058006368e-08, "loss": 0.8233, "step": 6975 }, { "epoch": 2.901853729558277, "grad_norm": 1.9668866395950317, "learning_rate": 5.632154196820727e-08, "loss": 0.8358, "step": 6976 }, { "epoch": 2.9022697137508775, "grad_norm": 44.598201751708984, "learning_rate": 5.5847699367652664e-08, "loss": 0.8575, "step": 6977 }, { "epoch": 2.902685697943478, "grad_norm": 1.6981960535049438, "learning_rate": 5.5375852873511546e-08, "loss": 0.749, "step": 6978 }, { "epoch": 2.9031016821360787, "grad_norm": 7.013233184814453, "learning_rate": 5.490600258050149e-08, "loss": 0.7865, "step": 6979 }, { "epoch": 2.9035176663286792, "grad_norm": 1.9040501117706299, "learning_rate": 5.443814858293373e-08, "loss": 0.7972, "step": 6980 }, { "epoch": 2.9039336505212803, "grad_norm": 2.0506272315979004, "learning_rate": 5.397229097472423e-08, "loss": 0.7264, "step": 6981 }, { "epoch": 2.904349634713881, "grad_norm": 1.9662340879440308, "learning_rate": 5.350842984938376e-08, "loss": 0.8917, "step": 6982 }, { "epoch": 2.9047656189064814, "grad_norm": 1.9433006048202515, "learning_rate": 5.30465653000245e-08, "loss": 0.8244, "step": 6983 }, { "epoch": 2.9051816030990825, "grad_norm": 1.6584559679031372, "learning_rate": 5.258669741935563e-08, "loss": 0.6441, "step": 6984 }, { "epoch": 2.905597587291683, "grad_norm": 1.8091959953308105, "learning_rate": 5.2128826299688853e-08, "loss": 0.7182, "step": 6985 }, { "epoch": 2.9060135714842836, "grad_norm": 1.867441177368164, "learning_rate": 5.1672952032933984e-08, "loss": 0.7808, "step": 6986 }, { "epoch": 2.906429555676884, "grad_norm": 1.7930117845535278, "learning_rate": 5.121907471059784e-08, "loss": 0.712, "step": 6987 }, { "epoch": 2.906845539869485, "grad_norm": 2.0790977478027344, "learning_rate": 5.076719442378863e-08, "loss": 0.8646, "step": 6988 }, { "epoch": 2.9072615240620854, "grad_norm": 1.9904537200927734, "learning_rate": 5.031731126321382e-08, "loss": 0.7942, "step": 6989 }, { "epoch": 2.9076775082546864, "grad_norm": 1.892932415008545, "learning_rate": 4.9869425319177825e-08, "loss": 0.7721, "step": 6990 }, { "epoch": 2.908093492447287, "grad_norm": 1.879514217376709, "learning_rate": 4.9423536681586506e-08, "loss": 0.6881, "step": 6991 }, { "epoch": 2.9085094766398876, "grad_norm": 240.2626953125, "learning_rate": 4.8979645439943826e-08, "loss": 0.8099, "step": 6992 }, { "epoch": 2.9089254608324886, "grad_norm": 1.8994090557098389, "learning_rate": 4.853775168335184e-08, "loss": 0.741, "step": 6993 }, { "epoch": 2.909341445025089, "grad_norm": 78.83052825927734, "learning_rate": 4.8097855500512935e-08, "loss": 0.8045, "step": 6994 }, { "epoch": 2.9097574292176898, "grad_norm": 1.856120228767395, "learning_rate": 4.765995697972981e-08, "loss": 0.7758, "step": 6995 }, { "epoch": 2.9101734134102903, "grad_norm": 141.4324493408203, "learning_rate": 4.7224056208899915e-08, "loss": 0.7752, "step": 6996 }, { "epoch": 2.910589397602891, "grad_norm": 1.7501033544540405, "learning_rate": 4.679015327552439e-08, "loss": 0.7525, "step": 6997 }, { "epoch": 2.9110053817954915, "grad_norm": 2.0245988368988037, "learning_rate": 4.6358248266701324e-08, "loss": 0.782, "step": 6998 }, { "epoch": 2.9114213659880925, "grad_norm": 1.9301327466964722, "learning_rate": 4.592834126912693e-08, "loss": 0.8161, "step": 6999 }, { "epoch": 2.911837350180693, "grad_norm": 2.0891759395599365, "learning_rate": 4.550043236909663e-08, "loss": 0.7047, "step": 7000 }, { "epoch": 2.911837350180693, "eval_loss": 0.7519673705101013, "eval_runtime": 1876.2451, "eval_samples_per_second": 3.513, "eval_steps_per_second": 1.757, "step": 7000 } ], "logging_steps": 1, "max_steps": 7212, "num_input_tokens_seen": 0, "num_train_epochs": 3, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 4.786937654858951e+18, "train_batch_size": 2, "trial_name": null, "trial_params": null }