{ "best_metric": null, "best_model_checkpoint": null, "epoch": 5.0, "eval_steps": 500, "global_step": 2845, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0017574692442882249, "grad_norm": 6.209695924041015, "learning_rate": 2.8070175438596494e-07, "loss": 1.0224, "step": 1 }, { "epoch": 0.0035149384885764497, "grad_norm": 6.167651461515038, "learning_rate": 5.614035087719299e-07, "loss": 1.0387, "step": 2 }, { "epoch": 0.005272407732864675, "grad_norm": 6.274810973686463, "learning_rate": 8.421052631578948e-07, "loss": 1.0427, "step": 3 }, { "epoch": 0.007029876977152899, "grad_norm": 6.169714179899507, "learning_rate": 1.1228070175438598e-06, "loss": 1.0487, "step": 4 }, { "epoch": 0.008787346221441126, "grad_norm": 5.979446650377619, "learning_rate": 1.4035087719298246e-06, "loss": 1.0207, "step": 5 }, { "epoch": 0.01054481546572935, "grad_norm": 5.625163000414991, "learning_rate": 1.6842105263157895e-06, "loss": 1.0151, "step": 6 }, { "epoch": 0.012302284710017574, "grad_norm": 4.528382262139684, "learning_rate": 1.9649122807017546e-06, "loss": 0.9932, "step": 7 }, { "epoch": 0.014059753954305799, "grad_norm": 4.145765552800399, "learning_rate": 2.2456140350877195e-06, "loss": 0.98, "step": 8 }, { "epoch": 0.015817223198594025, "grad_norm": 2.190403069399786, "learning_rate": 2.5263157894736844e-06, "loss": 0.9501, "step": 9 }, { "epoch": 0.01757469244288225, "grad_norm": 2.101147582798379, "learning_rate": 2.8070175438596493e-06, "loss": 0.9442, "step": 10 }, { "epoch": 0.019332161687170474, "grad_norm": 1.836847445913435, "learning_rate": 3.0877192982456146e-06, "loss": 0.931, "step": 11 }, { "epoch": 0.0210896309314587, "grad_norm": 2.2458782245048527, "learning_rate": 3.368421052631579e-06, "loss": 0.9131, "step": 12 }, { "epoch": 0.022847100175746926, "grad_norm": 3.8196627302065056, "learning_rate": 3.6491228070175443e-06, "loss": 0.919, "step": 13 }, { "epoch": 0.02460456942003515, "grad_norm": 3.9181941676173206, "learning_rate": 3.929824561403509e-06, "loss": 0.9235, "step": 14 }, { "epoch": 0.026362038664323375, "grad_norm": 3.709355182723551, "learning_rate": 4.210526315789474e-06, "loss": 0.9199, "step": 15 }, { "epoch": 0.028119507908611598, "grad_norm": 3.39204973937961, "learning_rate": 4.491228070175439e-06, "loss": 0.8891, "step": 16 }, { "epoch": 0.029876977152899824, "grad_norm": 2.4033605998337597, "learning_rate": 4.771929824561404e-06, "loss": 0.8703, "step": 17 }, { "epoch": 0.03163444639718805, "grad_norm": 2.201057749344087, "learning_rate": 5.052631578947369e-06, "loss": 0.8666, "step": 18 }, { "epoch": 0.033391915641476276, "grad_norm": 1.7592524143463706, "learning_rate": 5.333333333333334e-06, "loss": 0.8514, "step": 19 }, { "epoch": 0.0351493848857645, "grad_norm": 1.2995166123525106, "learning_rate": 5.6140350877192985e-06, "loss": 0.8237, "step": 20 }, { "epoch": 0.03690685413005272, "grad_norm": 1.2323264379097971, "learning_rate": 5.8947368421052634e-06, "loss": 0.8302, "step": 21 }, { "epoch": 0.03866432337434095, "grad_norm": 1.1943363162060407, "learning_rate": 6.175438596491229e-06, "loss": 0.8205, "step": 22 }, { "epoch": 0.040421792618629174, "grad_norm": 1.1483058764970768, "learning_rate": 6.456140350877193e-06, "loss": 0.8103, "step": 23 }, { "epoch": 0.0421792618629174, "grad_norm": 1.0289005733395333, "learning_rate": 6.736842105263158e-06, "loss": 0.8013, "step": 24 }, { "epoch": 0.043936731107205626, "grad_norm": 1.0263725061264142, "learning_rate": 7.017543859649123e-06, "loss": 0.795, "step": 25 }, { "epoch": 0.04569420035149385, "grad_norm": 1.0181382868831441, "learning_rate": 7.298245614035089e-06, "loss": 0.7948, "step": 26 }, { "epoch": 0.04745166959578207, "grad_norm": 0.8066445235520664, "learning_rate": 7.578947368421054e-06, "loss": 0.7739, "step": 27 }, { "epoch": 0.0492091388400703, "grad_norm": 0.863700972340358, "learning_rate": 7.859649122807018e-06, "loss": 0.772, "step": 28 }, { "epoch": 0.050966608084358524, "grad_norm": 0.8760639636792864, "learning_rate": 8.140350877192983e-06, "loss": 0.7827, "step": 29 }, { "epoch": 0.05272407732864675, "grad_norm": 0.7703021544990425, "learning_rate": 8.421052631578948e-06, "loss": 0.766, "step": 30 }, { "epoch": 0.054481546572934976, "grad_norm": 0.6390803291040503, "learning_rate": 8.701754385964914e-06, "loss": 0.7611, "step": 31 }, { "epoch": 0.056239015817223195, "grad_norm": 0.7390032351043727, "learning_rate": 8.982456140350878e-06, "loss": 0.7693, "step": 32 }, { "epoch": 0.05799648506151142, "grad_norm": 0.6865016605420768, "learning_rate": 9.263157894736842e-06, "loss": 0.7418, "step": 33 }, { "epoch": 0.05975395430579965, "grad_norm": 0.5555923621365076, "learning_rate": 9.543859649122808e-06, "loss": 0.759, "step": 34 }, { "epoch": 0.061511423550087874, "grad_norm": 0.5308234850547482, "learning_rate": 9.824561403508772e-06, "loss": 0.7491, "step": 35 }, { "epoch": 0.0632688927943761, "grad_norm": 0.5585273999914359, "learning_rate": 1.0105263157894738e-05, "loss": 0.7495, "step": 36 }, { "epoch": 0.06502636203866433, "grad_norm": 0.6499711768672977, "learning_rate": 1.0385964912280702e-05, "loss": 0.7483, "step": 37 }, { "epoch": 0.06678383128295255, "grad_norm": 0.5123575916775978, "learning_rate": 1.0666666666666667e-05, "loss": 0.7297, "step": 38 }, { "epoch": 0.06854130052724078, "grad_norm": 0.5200227435900662, "learning_rate": 1.0947368421052633e-05, "loss": 0.7377, "step": 39 }, { "epoch": 0.070298769771529, "grad_norm": 0.5341229211420955, "learning_rate": 1.1228070175438597e-05, "loss": 0.7216, "step": 40 }, { "epoch": 0.07205623901581722, "grad_norm": 0.4276384066933864, "learning_rate": 1.1508771929824563e-05, "loss": 0.7248, "step": 41 }, { "epoch": 0.07381370826010544, "grad_norm": 0.41223134442566367, "learning_rate": 1.1789473684210527e-05, "loss": 0.7227, "step": 42 }, { "epoch": 0.07557117750439367, "grad_norm": 0.38388356985204003, "learning_rate": 1.2070175438596493e-05, "loss": 0.723, "step": 43 }, { "epoch": 0.0773286467486819, "grad_norm": 0.34550586086227697, "learning_rate": 1.2350877192982458e-05, "loss": 0.7386, "step": 44 }, { "epoch": 0.07908611599297012, "grad_norm": 0.3386055223267995, "learning_rate": 1.263157894736842e-05, "loss": 0.7177, "step": 45 }, { "epoch": 0.08084358523725835, "grad_norm": 0.35668288776966045, "learning_rate": 1.2912280701754386e-05, "loss": 0.7056, "step": 46 }, { "epoch": 0.08260105448154657, "grad_norm": 0.33083949324281775, "learning_rate": 1.3192982456140354e-05, "loss": 0.7058, "step": 47 }, { "epoch": 0.0843585237258348, "grad_norm": 0.3195509510608797, "learning_rate": 1.3473684210526316e-05, "loss": 0.6977, "step": 48 }, { "epoch": 0.08611599297012303, "grad_norm": 0.32639798014775656, "learning_rate": 1.3754385964912282e-05, "loss": 0.6995, "step": 49 }, { "epoch": 0.08787346221441125, "grad_norm": 0.3047966930487889, "learning_rate": 1.4035087719298246e-05, "loss": 0.7101, "step": 50 }, { "epoch": 0.08963093145869948, "grad_norm": 0.3472691515316833, "learning_rate": 1.4315789473684212e-05, "loss": 0.7049, "step": 51 }, { "epoch": 0.0913884007029877, "grad_norm": 0.30367958167853654, "learning_rate": 1.4596491228070177e-05, "loss": 0.6863, "step": 52 }, { "epoch": 0.09314586994727592, "grad_norm": 0.3237102711622167, "learning_rate": 1.4877192982456141e-05, "loss": 0.6812, "step": 53 }, { "epoch": 0.09490333919156414, "grad_norm": 0.3087856576674247, "learning_rate": 1.5157894736842107e-05, "loss": 0.6972, "step": 54 }, { "epoch": 0.09666080843585237, "grad_norm": 0.3237935339226419, "learning_rate": 1.543859649122807e-05, "loss": 0.69, "step": 55 }, { "epoch": 0.0984182776801406, "grad_norm": 0.3979386946313198, "learning_rate": 1.5719298245614037e-05, "loss": 0.6853, "step": 56 }, { "epoch": 0.10017574692442882, "grad_norm": 0.349624377797256, "learning_rate": 1.6000000000000003e-05, "loss": 0.6931, "step": 57 }, { "epoch": 0.10193321616871705, "grad_norm": 0.29856919592800374, "learning_rate": 1.6280701754385965e-05, "loss": 0.6803, "step": 58 }, { "epoch": 0.10369068541300527, "grad_norm": 0.3618623972790648, "learning_rate": 1.656140350877193e-05, "loss": 0.6741, "step": 59 }, { "epoch": 0.1054481546572935, "grad_norm": 0.35818975268364855, "learning_rate": 1.6842105263157896e-05, "loss": 0.6781, "step": 60 }, { "epoch": 0.10720562390158173, "grad_norm": 0.3875179494949592, "learning_rate": 1.7122807017543862e-05, "loss": 0.6773, "step": 61 }, { "epoch": 0.10896309314586995, "grad_norm": 0.565040635348508, "learning_rate": 1.7403508771929828e-05, "loss": 0.686, "step": 62 }, { "epoch": 0.11072056239015818, "grad_norm": 0.8779953283082393, "learning_rate": 1.768421052631579e-05, "loss": 0.6944, "step": 63 }, { "epoch": 0.11247803163444639, "grad_norm": 1.4162966873204468, "learning_rate": 1.7964912280701756e-05, "loss": 0.683, "step": 64 }, { "epoch": 0.11423550087873462, "grad_norm": 0.6647699899103071, "learning_rate": 1.824561403508772e-05, "loss": 0.6695, "step": 65 }, { "epoch": 0.11599297012302284, "grad_norm": 0.44995071155092814, "learning_rate": 1.8526315789473684e-05, "loss": 0.6755, "step": 66 }, { "epoch": 0.11775043936731107, "grad_norm": 0.9922016120613779, "learning_rate": 1.880701754385965e-05, "loss": 0.6755, "step": 67 }, { "epoch": 0.1195079086115993, "grad_norm": 1.237816595249441, "learning_rate": 1.9087719298245616e-05, "loss": 0.6753, "step": 68 }, { "epoch": 0.12126537785588752, "grad_norm": 0.7846288945095098, "learning_rate": 1.936842105263158e-05, "loss": 0.668, "step": 69 }, { "epoch": 0.12302284710017575, "grad_norm": 1.278824065374317, "learning_rate": 1.9649122807017544e-05, "loss": 0.6684, "step": 70 }, { "epoch": 0.12478031634446397, "grad_norm": 0.6723090784947089, "learning_rate": 1.992982456140351e-05, "loss": 0.6721, "step": 71 }, { "epoch": 0.1265377855887522, "grad_norm": 0.7450139900517838, "learning_rate": 2.0210526315789475e-05, "loss": 0.6733, "step": 72 }, { "epoch": 0.1282952548330404, "grad_norm": 0.8635465542188946, "learning_rate": 2.049122807017544e-05, "loss": 0.6527, "step": 73 }, { "epoch": 0.13005272407732865, "grad_norm": 1.3202055003456785, "learning_rate": 2.0771929824561403e-05, "loss": 0.6777, "step": 74 }, { "epoch": 0.13181019332161686, "grad_norm": 0.7342566684295168, "learning_rate": 2.105263157894737e-05, "loss": 0.6582, "step": 75 }, { "epoch": 0.1335676625659051, "grad_norm": 0.9488866985089394, "learning_rate": 2.1333333333333335e-05, "loss": 0.6606, "step": 76 }, { "epoch": 0.13532513181019332, "grad_norm": 1.0937031213298525, "learning_rate": 2.16140350877193e-05, "loss": 0.6747, "step": 77 }, { "epoch": 0.13708260105448156, "grad_norm": 1.0310012639484436, "learning_rate": 2.1894736842105266e-05, "loss": 0.6772, "step": 78 }, { "epoch": 0.13884007029876977, "grad_norm": 0.9526941542259143, "learning_rate": 2.217543859649123e-05, "loss": 0.6597, "step": 79 }, { "epoch": 0.140597539543058, "grad_norm": 1.2962656491732856, "learning_rate": 2.2456140350877194e-05, "loss": 0.6564, "step": 80 }, { "epoch": 0.14235500878734622, "grad_norm": 0.6976881220316061, "learning_rate": 2.273684210526316e-05, "loss": 0.6453, "step": 81 }, { "epoch": 0.14411247803163443, "grad_norm": 1.337775648913974, "learning_rate": 2.3017543859649126e-05, "loss": 0.655, "step": 82 }, { "epoch": 0.14586994727592267, "grad_norm": 0.8542954721186394, "learning_rate": 2.329824561403509e-05, "loss": 0.6388, "step": 83 }, { "epoch": 0.14762741652021089, "grad_norm": 1.2978061435335844, "learning_rate": 2.3578947368421054e-05, "loss": 0.6648, "step": 84 }, { "epoch": 0.14938488576449913, "grad_norm": 0.8899304578688226, "learning_rate": 2.385964912280702e-05, "loss": 0.6524, "step": 85 }, { "epoch": 0.15114235500878734, "grad_norm": 1.2428495322991662, "learning_rate": 2.4140350877192985e-05, "loss": 0.6579, "step": 86 }, { "epoch": 0.15289982425307558, "grad_norm": 0.8926997595133822, "learning_rate": 2.442105263157895e-05, "loss": 0.6675, "step": 87 }, { "epoch": 0.1546572934973638, "grad_norm": 1.2251500809038784, "learning_rate": 2.4701754385964917e-05, "loss": 0.6523, "step": 88 }, { "epoch": 0.15641476274165203, "grad_norm": 0.8956090219680518, "learning_rate": 2.4982456140350882e-05, "loss": 0.6589, "step": 89 }, { "epoch": 0.15817223198594024, "grad_norm": 1.048453543920765, "learning_rate": 2.526315789473684e-05, "loss": 0.669, "step": 90 }, { "epoch": 0.15992970123022848, "grad_norm": 1.3246125744327673, "learning_rate": 2.5543859649122807e-05, "loss": 0.6593, "step": 91 }, { "epoch": 0.1616871704745167, "grad_norm": 1.013303315170601, "learning_rate": 2.5824561403508773e-05, "loss": 0.6623, "step": 92 }, { "epoch": 0.1634446397188049, "grad_norm": 1.2405607317131075, "learning_rate": 2.610526315789474e-05, "loss": 0.6532, "step": 93 }, { "epoch": 0.16520210896309315, "grad_norm": 1.290473135632496, "learning_rate": 2.6385964912280708e-05, "loss": 0.6412, "step": 94 }, { "epoch": 0.16695957820738136, "grad_norm": 1.213809492300772, "learning_rate": 2.6666666666666667e-05, "loss": 0.659, "step": 95 }, { "epoch": 0.1687170474516696, "grad_norm": 0.8448432424287885, "learning_rate": 2.6947368421052632e-05, "loss": 0.6575, "step": 96 }, { "epoch": 0.1704745166959578, "grad_norm": 0.851382375299901, "learning_rate": 2.7228070175438598e-05, "loss": 0.6531, "step": 97 }, { "epoch": 0.17223198594024605, "grad_norm": 1.1649996318908298, "learning_rate": 2.7508771929824564e-05, "loss": 0.6508, "step": 98 }, { "epoch": 0.17398945518453426, "grad_norm": 1.0692237080745912, "learning_rate": 2.778947368421053e-05, "loss": 0.6485, "step": 99 }, { "epoch": 0.1757469244288225, "grad_norm": 1.044106921655763, "learning_rate": 2.8070175438596492e-05, "loss": 0.6466, "step": 100 }, { "epoch": 0.17750439367311072, "grad_norm": 1.0779799162535695, "learning_rate": 2.8350877192982458e-05, "loss": 0.658, "step": 101 }, { "epoch": 0.17926186291739896, "grad_norm": 1.0961299570412804, "learning_rate": 2.8631578947368423e-05, "loss": 0.6666, "step": 102 }, { "epoch": 0.18101933216168717, "grad_norm": 1.295799486529893, "learning_rate": 2.891228070175439e-05, "loss": 0.6529, "step": 103 }, { "epoch": 0.1827768014059754, "grad_norm": 1.1679761660413621, "learning_rate": 2.9192982456140355e-05, "loss": 0.6312, "step": 104 }, { "epoch": 0.18453427065026362, "grad_norm": 1.1071477453915857, "learning_rate": 2.9473684210526317e-05, "loss": 0.6323, "step": 105 }, { "epoch": 0.18629173989455183, "grad_norm": 1.448426466534486, "learning_rate": 2.9754385964912283e-05, "loss": 0.639, "step": 106 }, { "epoch": 0.18804920913884007, "grad_norm": 0.8776342825780796, "learning_rate": 3.003508771929825e-05, "loss": 0.6489, "step": 107 }, { "epoch": 0.18980667838312829, "grad_norm": 1.2056153662661304, "learning_rate": 3.0315789473684214e-05, "loss": 0.6307, "step": 108 }, { "epoch": 0.19156414762741653, "grad_norm": 1.0173259574965863, "learning_rate": 3.059649122807018e-05, "loss": 0.6462, "step": 109 }, { "epoch": 0.19332161687170474, "grad_norm": 1.0518586335855524, "learning_rate": 3.087719298245614e-05, "loss": 0.6558, "step": 110 }, { "epoch": 0.19507908611599298, "grad_norm": 1.5760318449498316, "learning_rate": 3.1157894736842105e-05, "loss": 0.6489, "step": 111 }, { "epoch": 0.1968365553602812, "grad_norm": 0.8959525781892478, "learning_rate": 3.1438596491228074e-05, "loss": 0.6401, "step": 112 }, { "epoch": 0.19859402460456943, "grad_norm": 1.5156436917474452, "learning_rate": 3.1719298245614036e-05, "loss": 0.6415, "step": 113 }, { "epoch": 0.20035149384885764, "grad_norm": 1.2236131713603526, "learning_rate": 3.2000000000000005e-05, "loss": 0.6474, "step": 114 }, { "epoch": 0.20210896309314588, "grad_norm": 0.9991877143261114, "learning_rate": 3.228070175438597e-05, "loss": 0.6369, "step": 115 }, { "epoch": 0.2038664323374341, "grad_norm": 1.4304191136642472, "learning_rate": 3.256140350877193e-05, "loss": 0.6544, "step": 116 }, { "epoch": 0.2056239015817223, "grad_norm": 1.1773773405710453, "learning_rate": 3.28421052631579e-05, "loss": 0.6493, "step": 117 }, { "epoch": 0.20738137082601055, "grad_norm": 1.355645699168796, "learning_rate": 3.312280701754386e-05, "loss": 0.6495, "step": 118 }, { "epoch": 0.20913884007029876, "grad_norm": 0.9194528547743976, "learning_rate": 3.340350877192983e-05, "loss": 0.6373, "step": 119 }, { "epoch": 0.210896309314587, "grad_norm": 1.2418939563030513, "learning_rate": 3.368421052631579e-05, "loss": 0.6434, "step": 120 }, { "epoch": 0.2126537785588752, "grad_norm": 1.0436264657053904, "learning_rate": 3.3964912280701755e-05, "loss": 0.6425, "step": 121 }, { "epoch": 0.21441124780316345, "grad_norm": 1.19962826615495, "learning_rate": 3.4245614035087724e-05, "loss": 0.6383, "step": 122 }, { "epoch": 0.21616871704745166, "grad_norm": 1.5871745465183649, "learning_rate": 3.452631578947369e-05, "loss": 0.6309, "step": 123 }, { "epoch": 0.2179261862917399, "grad_norm": 0.7964869628379507, "learning_rate": 3.4807017543859656e-05, "loss": 0.6356, "step": 124 }, { "epoch": 0.21968365553602812, "grad_norm": 1.0083647936384401, "learning_rate": 3.508771929824562e-05, "loss": 0.6409, "step": 125 }, { "epoch": 0.22144112478031636, "grad_norm": 1.7762567515903804, "learning_rate": 3.536842105263158e-05, "loss": 0.6399, "step": 126 }, { "epoch": 0.22319859402460457, "grad_norm": 0.9816826978484458, "learning_rate": 3.564912280701755e-05, "loss": 0.6302, "step": 127 }, { "epoch": 0.22495606326889278, "grad_norm": 1.6898657249849012, "learning_rate": 3.592982456140351e-05, "loss": 0.6496, "step": 128 }, { "epoch": 0.22671353251318102, "grad_norm": 1.184231796114549, "learning_rate": 3.621052631578948e-05, "loss": 0.6433, "step": 129 }, { "epoch": 0.22847100175746923, "grad_norm": 1.561695206006119, "learning_rate": 3.649122807017544e-05, "loss": 0.6576, "step": 130 }, { "epoch": 0.23022847100175747, "grad_norm": 1.2696212261159592, "learning_rate": 3.6771929824561406e-05, "loss": 0.6377, "step": 131 }, { "epoch": 0.23198594024604569, "grad_norm": 1.1174310715304931, "learning_rate": 3.705263157894737e-05, "loss": 0.6394, "step": 132 }, { "epoch": 0.23374340949033393, "grad_norm": 1.2969434678596565, "learning_rate": 3.733333333333334e-05, "loss": 0.633, "step": 133 }, { "epoch": 0.23550087873462214, "grad_norm": 1.157726557408529, "learning_rate": 3.76140350877193e-05, "loss": 0.6344, "step": 134 }, { "epoch": 0.23725834797891038, "grad_norm": 1.4169406190789724, "learning_rate": 3.789473684210526e-05, "loss": 0.6521, "step": 135 }, { "epoch": 0.2390158172231986, "grad_norm": 1.141058703201503, "learning_rate": 3.817543859649123e-05, "loss": 0.6346, "step": 136 }, { "epoch": 0.24077328646748683, "grad_norm": 1.3208751142286084, "learning_rate": 3.8456140350877193e-05, "loss": 0.636, "step": 137 }, { "epoch": 0.24253075571177504, "grad_norm": 1.0095380884046672, "learning_rate": 3.873684210526316e-05, "loss": 0.6331, "step": 138 }, { "epoch": 0.24428822495606328, "grad_norm": 0.9917642466886244, "learning_rate": 3.9017543859649125e-05, "loss": 0.6276, "step": 139 }, { "epoch": 0.2460456942003515, "grad_norm": 1.3485336042381195, "learning_rate": 3.929824561403509e-05, "loss": 0.6202, "step": 140 }, { "epoch": 0.2478031634446397, "grad_norm": 1.2527662426979078, "learning_rate": 3.9578947368421056e-05, "loss": 0.6227, "step": 141 }, { "epoch": 0.24956063268892795, "grad_norm": 1.206522869565778, "learning_rate": 3.985964912280702e-05, "loss": 0.6104, "step": 142 }, { "epoch": 0.2513181019332162, "grad_norm": 0.9191553546385473, "learning_rate": 4.014035087719299e-05, "loss": 0.6368, "step": 143 }, { "epoch": 0.2530755711775044, "grad_norm": 1.3931273539092524, "learning_rate": 4.042105263157895e-05, "loss": 0.625, "step": 144 }, { "epoch": 0.2548330404217926, "grad_norm": 1.395421185473651, "learning_rate": 4.070175438596492e-05, "loss": 0.6512, "step": 145 }, { "epoch": 0.2565905096660808, "grad_norm": 0.9868056791268918, "learning_rate": 4.098245614035088e-05, "loss": 0.6364, "step": 146 }, { "epoch": 0.2583479789103691, "grad_norm": 1.649422438644548, "learning_rate": 4.126315789473685e-05, "loss": 0.6303, "step": 147 }, { "epoch": 0.2601054481546573, "grad_norm": 0.9514009928175235, "learning_rate": 4.1543859649122806e-05, "loss": 0.6448, "step": 148 }, { "epoch": 0.2618629173989455, "grad_norm": 1.5160454587449026, "learning_rate": 4.1824561403508775e-05, "loss": 0.6409, "step": 149 }, { "epoch": 0.26362038664323373, "grad_norm": 1.3281966586559268, "learning_rate": 4.210526315789474e-05, "loss": 0.6393, "step": 150 }, { "epoch": 0.26537785588752194, "grad_norm": 0.8768372460079714, "learning_rate": 4.238596491228071e-05, "loss": 0.6448, "step": 151 }, { "epoch": 0.2671353251318102, "grad_norm": 1.1838616575544094, "learning_rate": 4.266666666666667e-05, "loss": 0.6196, "step": 152 }, { "epoch": 0.2688927943760984, "grad_norm": 1.783695151236225, "learning_rate": 4.294736842105264e-05, "loss": 0.6456, "step": 153 }, { "epoch": 0.27065026362038663, "grad_norm": 0.9572815606993466, "learning_rate": 4.32280701754386e-05, "loss": 0.6314, "step": 154 }, { "epoch": 0.27240773286467485, "grad_norm": 1.095245677696617, "learning_rate": 4.350877192982457e-05, "loss": 0.6293, "step": 155 }, { "epoch": 0.2741652021089631, "grad_norm": 1.131595340920893, "learning_rate": 4.378947368421053e-05, "loss": 0.6257, "step": 156 }, { "epoch": 0.2759226713532513, "grad_norm": 0.9862662021095853, "learning_rate": 4.40701754385965e-05, "loss": 0.6476, "step": 157 }, { "epoch": 0.27768014059753954, "grad_norm": 1.4019090261001248, "learning_rate": 4.435087719298246e-05, "loss": 0.6204, "step": 158 }, { "epoch": 0.27943760984182775, "grad_norm": 1.3027267404408884, "learning_rate": 4.463157894736842e-05, "loss": 0.6372, "step": 159 }, { "epoch": 0.281195079086116, "grad_norm": 1.5077176587280812, "learning_rate": 4.491228070175439e-05, "loss": 0.6209, "step": 160 }, { "epoch": 0.28295254833040423, "grad_norm": 0.7505715459466871, "learning_rate": 4.519298245614035e-05, "loss": 0.6135, "step": 161 }, { "epoch": 0.28471001757469244, "grad_norm": 1.3067121738068124, "learning_rate": 4.547368421052632e-05, "loss": 0.6287, "step": 162 }, { "epoch": 0.28646748681898065, "grad_norm": 0.7871371290185103, "learning_rate": 4.575438596491228e-05, "loss": 0.6236, "step": 163 }, { "epoch": 0.28822495606326887, "grad_norm": 1.2432313859013742, "learning_rate": 4.603508771929825e-05, "loss": 0.621, "step": 164 }, { "epoch": 0.28998242530755713, "grad_norm": 1.4230679911580981, "learning_rate": 4.6315789473684214e-05, "loss": 0.6298, "step": 165 }, { "epoch": 0.29173989455184535, "grad_norm": 1.5013078526289187, "learning_rate": 4.659649122807018e-05, "loss": 0.6196, "step": 166 }, { "epoch": 0.29349736379613356, "grad_norm": 0.785419048723026, "learning_rate": 4.6877192982456145e-05, "loss": 0.6173, "step": 167 }, { "epoch": 0.29525483304042177, "grad_norm": 1.2159193090324154, "learning_rate": 4.715789473684211e-05, "loss": 0.6263, "step": 168 }, { "epoch": 0.29701230228471004, "grad_norm": 1.54668161272807, "learning_rate": 4.743859649122807e-05, "loss": 0.6274, "step": 169 }, { "epoch": 0.29876977152899825, "grad_norm": 1.1903579828018356, "learning_rate": 4.771929824561404e-05, "loss": 0.6124, "step": 170 }, { "epoch": 0.30052724077328646, "grad_norm": 1.613932425684105, "learning_rate": 4.8e-05, "loss": 0.6221, "step": 171 }, { "epoch": 0.3022847100175747, "grad_norm": 1.1001604556712876, "learning_rate": 4.828070175438597e-05, "loss": 0.6198, "step": 172 }, { "epoch": 0.30404217926186294, "grad_norm": 1.185935515206137, "learning_rate": 4.856140350877193e-05, "loss": 0.6359, "step": 173 }, { "epoch": 0.30579964850615116, "grad_norm": 1.4658562557403336, "learning_rate": 4.88421052631579e-05, "loss": 0.621, "step": 174 }, { "epoch": 0.30755711775043937, "grad_norm": 1.5409970025392272, "learning_rate": 4.9122807017543864e-05, "loss": 0.6222, "step": 175 }, { "epoch": 0.3093145869947276, "grad_norm": 0.8777047843849799, "learning_rate": 4.940350877192983e-05, "loss": 0.6403, "step": 176 }, { "epoch": 0.3110720562390158, "grad_norm": 1.0624356996098892, "learning_rate": 4.9684210526315796e-05, "loss": 0.6247, "step": 177 }, { "epoch": 0.31282952548330406, "grad_norm": 1.371915802445492, "learning_rate": 4.9964912280701765e-05, "loss": 0.6251, "step": 178 }, { "epoch": 0.3145869947275923, "grad_norm": 0.9761842230597857, "learning_rate": 5.024561403508772e-05, "loss": 0.6273, "step": 179 }, { "epoch": 0.3163444639718805, "grad_norm": 1.8571514761016172, "learning_rate": 5.052631578947368e-05, "loss": 0.6272, "step": 180 }, { "epoch": 0.3181019332161687, "grad_norm": 1.0988988716364734, "learning_rate": 5.080701754385965e-05, "loss": 0.6159, "step": 181 }, { "epoch": 0.31985940246045697, "grad_norm": 1.5718193088039054, "learning_rate": 5.1087719298245614e-05, "loss": 0.617, "step": 182 }, { "epoch": 0.3216168717047452, "grad_norm": 1.6538672725071402, "learning_rate": 5.136842105263158e-05, "loss": 0.6132, "step": 183 }, { "epoch": 0.3233743409490334, "grad_norm": 1.0959616874694302, "learning_rate": 5.1649122807017546e-05, "loss": 0.6219, "step": 184 }, { "epoch": 0.3251318101933216, "grad_norm": 1.7150198990138388, "learning_rate": 5.1929824561403515e-05, "loss": 0.6352, "step": 185 }, { "epoch": 0.3268892794376098, "grad_norm": 1.4813164959439133, "learning_rate": 5.221052631578948e-05, "loss": 0.6203, "step": 186 }, { "epoch": 0.3286467486818981, "grad_norm": 1.278420678629466, "learning_rate": 5.2491228070175446e-05, "loss": 0.6226, "step": 187 }, { "epoch": 0.3304042179261863, "grad_norm": 1.5945177218928257, "learning_rate": 5.2771929824561415e-05, "loss": 0.6151, "step": 188 }, { "epoch": 0.3321616871704745, "grad_norm": 1.1614109824916725, "learning_rate": 5.305263157894737e-05, "loss": 0.6198, "step": 189 }, { "epoch": 0.3339191564147627, "grad_norm": 1.7278412933182057, "learning_rate": 5.333333333333333e-05, "loss": 0.6383, "step": 190 }, { "epoch": 0.335676625659051, "grad_norm": 1.0132272225622532, "learning_rate": 5.36140350877193e-05, "loss": 0.6289, "step": 191 }, { "epoch": 0.3374340949033392, "grad_norm": 1.982475911255453, "learning_rate": 5.3894736842105265e-05, "loss": 0.6323, "step": 192 }, { "epoch": 0.3391915641476274, "grad_norm": 1.6445068721944456, "learning_rate": 5.4175438596491234e-05, "loss": 0.6349, "step": 193 }, { "epoch": 0.3409490333919156, "grad_norm": 1.525126105130713, "learning_rate": 5.4456140350877196e-05, "loss": 0.6107, "step": 194 }, { "epoch": 0.3427065026362039, "grad_norm": 1.4371881520430103, "learning_rate": 5.4736842105263165e-05, "loss": 0.6299, "step": 195 }, { "epoch": 0.3444639718804921, "grad_norm": 1.3756040094861095, "learning_rate": 5.501754385964913e-05, "loss": 0.6352, "step": 196 }, { "epoch": 0.3462214411247803, "grad_norm": 1.1402730847659939, "learning_rate": 5.52982456140351e-05, "loss": 0.6152, "step": 197 }, { "epoch": 0.34797891036906853, "grad_norm": 1.8048737419262584, "learning_rate": 5.557894736842106e-05, "loss": 0.6184, "step": 198 }, { "epoch": 0.34973637961335674, "grad_norm": 1.086258651919308, "learning_rate": 5.585964912280702e-05, "loss": 0.6272, "step": 199 }, { "epoch": 0.351493848857645, "grad_norm": 1.4347016176632732, "learning_rate": 5.6140350877192984e-05, "loss": 0.6174, "step": 200 }, { "epoch": 0.3532513181019332, "grad_norm": 1.1894318110437299, "learning_rate": 5.642105263157895e-05, "loss": 0.63, "step": 201 }, { "epoch": 0.35500878734622143, "grad_norm": 1.5698764932484144, "learning_rate": 5.6701754385964915e-05, "loss": 0.6158, "step": 202 }, { "epoch": 0.35676625659050965, "grad_norm": 1.2310068549895534, "learning_rate": 5.6982456140350884e-05, "loss": 0.626, "step": 203 }, { "epoch": 0.3585237258347979, "grad_norm": 1.3741741342051308, "learning_rate": 5.726315789473685e-05, "loss": 0.6225, "step": 204 }, { "epoch": 0.3602811950790861, "grad_norm": 1.2609892441476567, "learning_rate": 5.7543859649122816e-05, "loss": 0.6288, "step": 205 }, { "epoch": 0.36203866432337434, "grad_norm": 1.3304073345934666, "learning_rate": 5.782456140350878e-05, "loss": 0.6194, "step": 206 }, { "epoch": 0.36379613356766255, "grad_norm": 1.1505490601780037, "learning_rate": 5.810526315789475e-05, "loss": 0.6302, "step": 207 }, { "epoch": 0.3655536028119508, "grad_norm": 1.030739822657584, "learning_rate": 5.838596491228071e-05, "loss": 0.6135, "step": 208 }, { "epoch": 0.36731107205623903, "grad_norm": 1.1829089997445763, "learning_rate": 5.8666666666666665e-05, "loss": 0.64, "step": 209 }, { "epoch": 0.36906854130052724, "grad_norm": 1.8486895586749998, "learning_rate": 5.8947368421052634e-05, "loss": 0.6149, "step": 210 }, { "epoch": 0.37082601054481545, "grad_norm": 1.212469962924691, "learning_rate": 5.9228070175438597e-05, "loss": 0.6019, "step": 211 }, { "epoch": 0.37258347978910367, "grad_norm": 1.7602511247560482, "learning_rate": 5.9508771929824566e-05, "loss": 0.6266, "step": 212 }, { "epoch": 0.37434094903339193, "grad_norm": 1.1301360484834986, "learning_rate": 5.978947368421053e-05, "loss": 0.6116, "step": 213 }, { "epoch": 0.37609841827768015, "grad_norm": 0.8936891718540865, "learning_rate": 6.00701754385965e-05, "loss": 0.6078, "step": 214 }, { "epoch": 0.37785588752196836, "grad_norm": 1.952472585549605, "learning_rate": 6.035087719298246e-05, "loss": 0.635, "step": 215 }, { "epoch": 0.37961335676625657, "grad_norm": 1.0483082837538027, "learning_rate": 6.063157894736843e-05, "loss": 0.6193, "step": 216 }, { "epoch": 0.38137082601054484, "grad_norm": 1.7582717056124453, "learning_rate": 6.091228070175439e-05, "loss": 0.6265, "step": 217 }, { "epoch": 0.38312829525483305, "grad_norm": 1.2430354594341573, "learning_rate": 6.119298245614036e-05, "loss": 0.6176, "step": 218 }, { "epoch": 0.38488576449912126, "grad_norm": 1.3834918157657696, "learning_rate": 6.147368421052632e-05, "loss": 0.6044, "step": 219 }, { "epoch": 0.3866432337434095, "grad_norm": 1.2168876691958734, "learning_rate": 6.175438596491228e-05, "loss": 0.6202, "step": 220 }, { "epoch": 0.3884007029876977, "grad_norm": 1.8016829814809519, "learning_rate": 6.203508771929825e-05, "loss": 0.6114, "step": 221 }, { "epoch": 0.39015817223198596, "grad_norm": 0.9923069876961094, "learning_rate": 6.231578947368421e-05, "loss": 0.6069, "step": 222 }, { "epoch": 0.39191564147627417, "grad_norm": 2.025541891664656, "learning_rate": 6.259649122807018e-05, "loss": 0.617, "step": 223 }, { "epoch": 0.3936731107205624, "grad_norm": 1.5809142885535594, "learning_rate": 6.287719298245615e-05, "loss": 0.6095, "step": 224 }, { "epoch": 0.3954305799648506, "grad_norm": 1.6543220566703887, "learning_rate": 6.315789473684212e-05, "loss": 0.612, "step": 225 }, { "epoch": 0.39718804920913886, "grad_norm": 1.2237693348202479, "learning_rate": 6.343859649122807e-05, "loss": 0.6172, "step": 226 }, { "epoch": 0.3989455184534271, "grad_norm": 1.9446008312681025, "learning_rate": 6.371929824561404e-05, "loss": 0.6262, "step": 227 }, { "epoch": 0.4007029876977153, "grad_norm": 1.447594805070642, "learning_rate": 6.400000000000001e-05, "loss": 0.615, "step": 228 }, { "epoch": 0.4024604569420035, "grad_norm": 1.3386904710634997, "learning_rate": 6.428070175438598e-05, "loss": 0.6174, "step": 229 }, { "epoch": 0.40421792618629176, "grad_norm": 1.4589542007723908, "learning_rate": 6.456140350877194e-05, "loss": 0.6098, "step": 230 }, { "epoch": 0.40597539543058, "grad_norm": 1.4130867169351466, "learning_rate": 6.484210526315789e-05, "loss": 0.6072, "step": 231 }, { "epoch": 0.4077328646748682, "grad_norm": 1.113059304103117, "learning_rate": 6.512280701754386e-05, "loss": 0.6053, "step": 232 }, { "epoch": 0.4094903339191564, "grad_norm": 2.070723286531759, "learning_rate": 6.540350877192983e-05, "loss": 0.6071, "step": 233 }, { "epoch": 0.4112478031634446, "grad_norm": 0.9927980066936902, "learning_rate": 6.56842105263158e-05, "loss": 0.6125, "step": 234 }, { "epoch": 0.4130052724077329, "grad_norm": 1.2033395537919045, "learning_rate": 6.596491228070175e-05, "loss": 0.621, "step": 235 }, { "epoch": 0.4147627416520211, "grad_norm": 1.740873820128363, "learning_rate": 6.624561403508772e-05, "loss": 0.6108, "step": 236 }, { "epoch": 0.4165202108963093, "grad_norm": 1.6531920238439015, "learning_rate": 6.652631578947369e-05, "loss": 0.6027, "step": 237 }, { "epoch": 0.4182776801405975, "grad_norm": 0.8294904354111311, "learning_rate": 6.680701754385966e-05, "loss": 0.6092, "step": 238 }, { "epoch": 0.4200351493848858, "grad_norm": 1.6977554447763887, "learning_rate": 6.708771929824563e-05, "loss": 0.5972, "step": 239 }, { "epoch": 0.421792618629174, "grad_norm": 1.786353753602148, "learning_rate": 6.736842105263159e-05, "loss": 0.6062, "step": 240 }, { "epoch": 0.4235500878734622, "grad_norm": 1.0102848708260133, "learning_rate": 6.764912280701754e-05, "loss": 0.6031, "step": 241 }, { "epoch": 0.4253075571177504, "grad_norm": 1.7844432885524308, "learning_rate": 6.792982456140351e-05, "loss": 0.6152, "step": 242 }, { "epoch": 0.4270650263620387, "grad_norm": 1.3601408668338957, "learning_rate": 6.821052631578948e-05, "loss": 0.6186, "step": 243 }, { "epoch": 0.4288224956063269, "grad_norm": 1.1455946826383585, "learning_rate": 6.849122807017545e-05, "loss": 0.6166, "step": 244 }, { "epoch": 0.4305799648506151, "grad_norm": 1.231098953395264, "learning_rate": 6.87719298245614e-05, "loss": 0.6234, "step": 245 }, { "epoch": 0.43233743409490333, "grad_norm": 1.7361046316402804, "learning_rate": 6.905263157894737e-05, "loss": 0.6332, "step": 246 }, { "epoch": 0.43409490333919154, "grad_norm": 1.1433770431001673, "learning_rate": 6.933333333333334e-05, "loss": 0.5903, "step": 247 }, { "epoch": 0.4358523725834798, "grad_norm": 1.0748161014527775, "learning_rate": 6.961403508771931e-05, "loss": 0.6125, "step": 248 }, { "epoch": 0.437609841827768, "grad_norm": 1.421608476165314, "learning_rate": 6.989473684210527e-05, "loss": 0.6095, "step": 249 }, { "epoch": 0.43936731107205623, "grad_norm": 1.1319617627242087, "learning_rate": 7.017543859649124e-05, "loss": 0.6177, "step": 250 }, { "epoch": 0.44112478031634444, "grad_norm": 1.5877406653183856, "learning_rate": 7.045614035087719e-05, "loss": 0.6128, "step": 251 }, { "epoch": 0.4428822495606327, "grad_norm": 1.4341482616429166, "learning_rate": 7.073684210526316e-05, "loss": 0.6, "step": 252 }, { "epoch": 0.4446397188049209, "grad_norm": 1.0638255791175584, "learning_rate": 7.101754385964913e-05, "loss": 0.5971, "step": 253 }, { "epoch": 0.44639718804920914, "grad_norm": 1.7574748602071937, "learning_rate": 7.12982456140351e-05, "loss": 0.6241, "step": 254 }, { "epoch": 0.44815465729349735, "grad_norm": 1.067090429245322, "learning_rate": 7.157894736842105e-05, "loss": 0.611, "step": 255 }, { "epoch": 0.44991212653778556, "grad_norm": 1.4333019082561398, "learning_rate": 7.185964912280702e-05, "loss": 0.6083, "step": 256 }, { "epoch": 0.45166959578207383, "grad_norm": 1.4211854842235638, "learning_rate": 7.214035087719299e-05, "loss": 0.6229, "step": 257 }, { "epoch": 0.45342706502636204, "grad_norm": 1.4259574197617761, "learning_rate": 7.242105263157896e-05, "loss": 0.6071, "step": 258 }, { "epoch": 0.45518453427065025, "grad_norm": 0.9653745392313948, "learning_rate": 7.270175438596492e-05, "loss": 0.6205, "step": 259 }, { "epoch": 0.45694200351493847, "grad_norm": 1.6025670157477026, "learning_rate": 7.298245614035087e-05, "loss": 0.6192, "step": 260 }, { "epoch": 0.45869947275922673, "grad_norm": 1.6614682007977808, "learning_rate": 7.326315789473684e-05, "loss": 0.5988, "step": 261 }, { "epoch": 0.46045694200351495, "grad_norm": 0.9329050866616899, "learning_rate": 7.354385964912281e-05, "loss": 0.6004, "step": 262 }, { "epoch": 0.46221441124780316, "grad_norm": 1.573549071969887, "learning_rate": 7.382456140350878e-05, "loss": 0.613, "step": 263 }, { "epoch": 0.46397188049209137, "grad_norm": 0.9992626412675855, "learning_rate": 7.410526315789474e-05, "loss": 0.5939, "step": 264 }, { "epoch": 0.46572934973637964, "grad_norm": 1.5809525337684551, "learning_rate": 7.43859649122807e-05, "loss": 0.6212, "step": 265 }, { "epoch": 0.46748681898066785, "grad_norm": 1.241402514449554, "learning_rate": 7.466666666666667e-05, "loss": 0.6277, "step": 266 }, { "epoch": 0.46924428822495606, "grad_norm": 1.169298361233409, "learning_rate": 7.494736842105264e-05, "loss": 0.6043, "step": 267 }, { "epoch": 0.4710017574692443, "grad_norm": 1.288809429608367, "learning_rate": 7.52280701754386e-05, "loss": 0.6106, "step": 268 }, { "epoch": 0.4727592267135325, "grad_norm": 1.400099158529274, "learning_rate": 7.550877192982457e-05, "loss": 0.5791, "step": 269 }, { "epoch": 0.47451669595782076, "grad_norm": 1.464689412269974, "learning_rate": 7.578947368421052e-05, "loss": 0.5756, "step": 270 }, { "epoch": 0.47627416520210897, "grad_norm": 0.9866380498196698, "learning_rate": 7.607017543859649e-05, "loss": 0.6062, "step": 271 }, { "epoch": 0.4780316344463972, "grad_norm": 1.6757567652942478, "learning_rate": 7.635087719298246e-05, "loss": 0.6134, "step": 272 }, { "epoch": 0.4797891036906854, "grad_norm": 1.1751526474970524, "learning_rate": 7.663157894736843e-05, "loss": 0.591, "step": 273 }, { "epoch": 0.48154657293497366, "grad_norm": 1.3090593352893534, "learning_rate": 7.691228070175439e-05, "loss": 0.5977, "step": 274 }, { "epoch": 0.4833040421792619, "grad_norm": 1.1865472399725696, "learning_rate": 7.719298245614036e-05, "loss": 0.5966, "step": 275 }, { "epoch": 0.4850615114235501, "grad_norm": 1.582102396035751, "learning_rate": 7.747368421052633e-05, "loss": 0.6105, "step": 276 }, { "epoch": 0.4868189806678383, "grad_norm": 1.113997391226846, "learning_rate": 7.77543859649123e-05, "loss": 0.5934, "step": 277 }, { "epoch": 0.48857644991212656, "grad_norm": 1.3359174507752907, "learning_rate": 7.803508771929825e-05, "loss": 0.6046, "step": 278 }, { "epoch": 0.4903339191564148, "grad_norm": 1.569503853790641, "learning_rate": 7.831578947368422e-05, "loss": 0.5989, "step": 279 }, { "epoch": 0.492091388400703, "grad_norm": 1.0119767744065404, "learning_rate": 7.859649122807017e-05, "loss": 0.603, "step": 280 }, { "epoch": 0.4938488576449912, "grad_norm": 1.5166079588162267, "learning_rate": 7.887719298245614e-05, "loss": 0.6253, "step": 281 }, { "epoch": 0.4956063268892794, "grad_norm": 1.0783562606342447, "learning_rate": 7.915789473684211e-05, "loss": 0.6003, "step": 282 }, { "epoch": 0.4973637961335677, "grad_norm": 1.2282928987350967, "learning_rate": 7.943859649122807e-05, "loss": 0.6133, "step": 283 }, { "epoch": 0.4991212653778559, "grad_norm": 1.1905187117111355, "learning_rate": 7.971929824561404e-05, "loss": 0.6174, "step": 284 }, { "epoch": 0.5008787346221442, "grad_norm": 1.5514788651914946, "learning_rate": 8e-05, "loss": 0.6107, "step": 285 }, { "epoch": 0.5026362038664324, "grad_norm": 1.3265526891123802, "learning_rate": 7.999996988036145e-05, "loss": 0.6043, "step": 286 }, { "epoch": 0.5043936731107206, "grad_norm": 1.263587982363945, "learning_rate": 7.999987952149114e-05, "loss": 0.6116, "step": 287 }, { "epoch": 0.5061511423550088, "grad_norm": 1.3248853265547613, "learning_rate": 7.999972892352515e-05, "loss": 0.5846, "step": 288 }, { "epoch": 0.507908611599297, "grad_norm": 0.9206754533797502, "learning_rate": 7.999951808669029e-05, "loss": 0.6108, "step": 289 }, { "epoch": 0.5096660808435852, "grad_norm": 0.9446208195476323, "learning_rate": 7.999924701130405e-05, "loss": 0.5981, "step": 290 }, { "epoch": 0.5114235500878734, "grad_norm": 1.8643625949483962, "learning_rate": 7.99989156977747e-05, "loss": 0.6246, "step": 291 }, { "epoch": 0.5131810193321616, "grad_norm": 1.2170434853606356, "learning_rate": 7.999852414660116e-05, "loss": 0.6092, "step": 292 }, { "epoch": 0.5149384885764499, "grad_norm": 1.6368158418279068, "learning_rate": 7.999807235837312e-05, "loss": 0.6127, "step": 293 }, { "epoch": 0.5166959578207382, "grad_norm": 0.9889845767994027, "learning_rate": 7.999756033377097e-05, "loss": 0.6117, "step": 294 }, { "epoch": 0.5184534270650264, "grad_norm": 1.5398038252040316, "learning_rate": 7.99969880735658e-05, "loss": 0.614, "step": 295 }, { "epoch": 0.5202108963093146, "grad_norm": 1.1162316375549377, "learning_rate": 7.99963555786194e-05, "loss": 0.6212, "step": 296 }, { "epoch": 0.5219683655536028, "grad_norm": 1.2265764878314303, "learning_rate": 7.999566284988434e-05, "loss": 0.6161, "step": 297 }, { "epoch": 0.523725834797891, "grad_norm": 1.0971674963605422, "learning_rate": 7.999490988840382e-05, "loss": 0.6023, "step": 298 }, { "epoch": 0.5254833040421792, "grad_norm": 1.3240240268384225, "learning_rate": 7.99940966953118e-05, "loss": 0.5863, "step": 299 }, { "epoch": 0.5272407732864675, "grad_norm": 1.6279901474263196, "learning_rate": 7.999322327183294e-05, "loss": 0.6046, "step": 300 }, { "epoch": 0.5289982425307557, "grad_norm": 0.8492530186809667, "learning_rate": 7.999228961928259e-05, "loss": 0.6235, "step": 301 }, { "epoch": 0.5307557117750439, "grad_norm": 1.5720699069453548, "learning_rate": 7.999129573906684e-05, "loss": 0.6142, "step": 302 }, { "epoch": 0.5325131810193322, "grad_norm": 1.237753209552606, "learning_rate": 7.999024163268242e-05, "loss": 0.6139, "step": 303 }, { "epoch": 0.5342706502636204, "grad_norm": 1.1619896855186203, "learning_rate": 7.998912730171681e-05, "loss": 0.599, "step": 304 }, { "epoch": 0.5360281195079086, "grad_norm": 1.8045355824915206, "learning_rate": 7.998795274784818e-05, "loss": 0.5994, "step": 305 }, { "epoch": 0.5377855887521968, "grad_norm": 0.9023715327473726, "learning_rate": 7.998671797284536e-05, "loss": 0.6009, "step": 306 }, { "epoch": 0.539543057996485, "grad_norm": 1.646842285185737, "learning_rate": 7.998542297856794e-05, "loss": 0.6273, "step": 307 }, { "epoch": 0.5413005272407733, "grad_norm": 1.3392682436060837, "learning_rate": 7.998406776696612e-05, "loss": 0.6061, "step": 308 }, { "epoch": 0.5430579964850615, "grad_norm": 1.2740183583247389, "learning_rate": 7.998265234008086e-05, "loss": 0.6155, "step": 309 }, { "epoch": 0.5448154657293497, "grad_norm": 1.0706123274832355, "learning_rate": 7.998117670004374e-05, "loss": 0.5919, "step": 310 }, { "epoch": 0.546572934973638, "grad_norm": 1.755766690549818, "learning_rate": 7.997964084907704e-05, "loss": 0.5943, "step": 311 }, { "epoch": 0.5483304042179262, "grad_norm": 0.8873900288476679, "learning_rate": 7.997804478949375e-05, "loss": 0.6122, "step": 312 }, { "epoch": 0.5500878734622144, "grad_norm": 1.5365267293559204, "learning_rate": 7.99763885236975e-05, "loss": 0.5985, "step": 313 }, { "epoch": 0.5518453427065027, "grad_norm": 1.0197470217126197, "learning_rate": 7.997467205418259e-05, "loss": 0.5875, "step": 314 }, { "epoch": 0.5536028119507909, "grad_norm": 1.3661306979985832, "learning_rate": 7.997289538353399e-05, "loss": 0.6027, "step": 315 }, { "epoch": 0.5553602811950791, "grad_norm": 1.3748813016803847, "learning_rate": 7.997105851442734e-05, "loss": 0.6101, "step": 316 }, { "epoch": 0.5571177504393673, "grad_norm": 1.1736254334481004, "learning_rate": 7.996916144962893e-05, "loss": 0.5989, "step": 317 }, { "epoch": 0.5588752196836555, "grad_norm": 1.4948292669823051, "learning_rate": 7.99672041919957e-05, "loss": 0.5979, "step": 318 }, { "epoch": 0.5606326889279437, "grad_norm": 0.9761626970854795, "learning_rate": 7.996518674447525e-05, "loss": 0.5997, "step": 319 }, { "epoch": 0.562390158172232, "grad_norm": 1.1983093772541373, "learning_rate": 7.996310911010583e-05, "loss": 0.6102, "step": 320 }, { "epoch": 0.5641476274165202, "grad_norm": 0.7462130941647848, "learning_rate": 7.99609712920163e-05, "loss": 0.602, "step": 321 }, { "epoch": 0.5659050966608085, "grad_norm": 1.278803632499716, "learning_rate": 7.995877329342618e-05, "loss": 0.6002, "step": 322 }, { "epoch": 0.5676625659050967, "grad_norm": 1.3780384493596807, "learning_rate": 7.995651511764562e-05, "loss": 0.604, "step": 323 }, { "epoch": 0.5694200351493849, "grad_norm": 1.2389176932964108, "learning_rate": 7.99541967680754e-05, "loss": 0.5872, "step": 324 }, { "epoch": 0.5711775043936731, "grad_norm": 0.9138101698804183, "learning_rate": 7.99518182482069e-05, "loss": 0.5906, "step": 325 }, { "epoch": 0.5729349736379613, "grad_norm": 1.1339415797884769, "learning_rate": 7.994937956162214e-05, "loss": 0.5946, "step": 326 }, { "epoch": 0.5746924428822495, "grad_norm": 1.6491335309522757, "learning_rate": 7.994688071199373e-05, "loss": 0.6003, "step": 327 }, { "epoch": 0.5764499121265377, "grad_norm": 0.782642070937353, "learning_rate": 7.994432170308487e-05, "loss": 0.5973, "step": 328 }, { "epoch": 0.5782073813708261, "grad_norm": 0.8467422395625295, "learning_rate": 7.994170253874944e-05, "loss": 0.5783, "step": 329 }, { "epoch": 0.5799648506151143, "grad_norm": 1.4645664879234668, "learning_rate": 7.993902322293179e-05, "loss": 0.6005, "step": 330 }, { "epoch": 0.5817223198594025, "grad_norm": 1.3150055044922373, "learning_rate": 7.993628375966697e-05, "loss": 0.6029, "step": 331 }, { "epoch": 0.5834797891036907, "grad_norm": 1.1795117941048954, "learning_rate": 7.993348415308052e-05, "loss": 0.5968, "step": 332 }, { "epoch": 0.5852372583479789, "grad_norm": 1.0522189917135965, "learning_rate": 7.993062440738864e-05, "loss": 0.5909, "step": 333 }, { "epoch": 0.5869947275922671, "grad_norm": 1.511084303057692, "learning_rate": 7.992770452689803e-05, "loss": 0.5958, "step": 334 }, { "epoch": 0.5887521968365553, "grad_norm": 0.8906218538663991, "learning_rate": 7.992472451600596e-05, "loss": 0.6094, "step": 335 }, { "epoch": 0.5905096660808435, "grad_norm": 1.1670705842366254, "learning_rate": 7.992168437920033e-05, "loss": 0.5959, "step": 336 }, { "epoch": 0.5922671353251318, "grad_norm": 1.0438032314115804, "learning_rate": 7.991858412105947e-05, "loss": 0.5974, "step": 337 }, { "epoch": 0.5940246045694201, "grad_norm": 1.063442641841074, "learning_rate": 7.991542374625234e-05, "loss": 0.601, "step": 338 }, { "epoch": 0.5957820738137083, "grad_norm": 1.6271823907918928, "learning_rate": 7.991220325953841e-05, "loss": 0.6066, "step": 339 }, { "epoch": 0.5975395430579965, "grad_norm": 0.8863319416588012, "learning_rate": 7.990892266576768e-05, "loss": 0.6082, "step": 340 }, { "epoch": 0.5992970123022847, "grad_norm": 1.2081837296544964, "learning_rate": 7.990558196988064e-05, "loss": 0.6095, "step": 341 }, { "epoch": 0.6010544815465729, "grad_norm": 1.243173336061497, "learning_rate": 7.990218117690832e-05, "loss": 0.5947, "step": 342 }, { "epoch": 0.6028119507908611, "grad_norm": 1.2423372224522424, "learning_rate": 7.989872029197228e-05, "loss": 0.5868, "step": 343 }, { "epoch": 0.6045694200351494, "grad_norm": 0.9216117873100352, "learning_rate": 7.989519932028454e-05, "loss": 0.5914, "step": 344 }, { "epoch": 0.6063268892794376, "grad_norm": 0.7662368110663267, "learning_rate": 7.989161826714761e-05, "loss": 0.5746, "step": 345 }, { "epoch": 0.6080843585237259, "grad_norm": 1.0780558201345691, "learning_rate": 7.98879771379545e-05, "loss": 0.6083, "step": 346 }, { "epoch": 0.6098418277680141, "grad_norm": 1.8413542322439815, "learning_rate": 7.988427593818868e-05, "loss": 0.6174, "step": 347 }, { "epoch": 0.6115992970123023, "grad_norm": 0.9155977257183098, "learning_rate": 7.988051467342409e-05, "loss": 0.5985, "step": 348 }, { "epoch": 0.6133567662565905, "grad_norm": 1.5761353971439878, "learning_rate": 7.987669334932513e-05, "loss": 0.6122, "step": 349 }, { "epoch": 0.6151142355008787, "grad_norm": 1.0463483749562554, "learning_rate": 7.987281197164663e-05, "loss": 0.5815, "step": 350 }, { "epoch": 0.616871704745167, "grad_norm": 1.4425560371292374, "learning_rate": 7.98688705462339e-05, "loss": 0.6185, "step": 351 }, { "epoch": 0.6186291739894552, "grad_norm": 1.2024360761638095, "learning_rate": 7.986486907902263e-05, "loss": 0.5962, "step": 352 }, { "epoch": 0.6203866432337434, "grad_norm": 1.0256013478068695, "learning_rate": 7.9860807576039e-05, "loss": 0.5914, "step": 353 }, { "epoch": 0.6221441124780316, "grad_norm": 1.2891319978627767, "learning_rate": 7.98566860433995e-05, "loss": 0.6028, "step": 354 }, { "epoch": 0.6239015817223199, "grad_norm": 0.8832250667580478, "learning_rate": 7.985250448731112e-05, "loss": 0.5972, "step": 355 }, { "epoch": 0.6256590509666081, "grad_norm": 1.3333710242552175, "learning_rate": 7.984826291407121e-05, "loss": 0.5899, "step": 356 }, { "epoch": 0.6274165202108963, "grad_norm": 1.2393654129024474, "learning_rate": 7.98439613300675e-05, "loss": 0.5963, "step": 357 }, { "epoch": 0.6291739894551845, "grad_norm": 0.9149883644184567, "learning_rate": 7.983959974177808e-05, "loss": 0.5936, "step": 358 }, { "epoch": 0.6309314586994728, "grad_norm": 1.0782830084380892, "learning_rate": 7.983517815577144e-05, "loss": 0.6034, "step": 359 }, { "epoch": 0.632688927943761, "grad_norm": 0.6721067208016547, "learning_rate": 7.98306965787064e-05, "loss": 0.5767, "step": 360 }, { "epoch": 0.6344463971880492, "grad_norm": 0.9459565369577428, "learning_rate": 7.982615501733213e-05, "loss": 0.6046, "step": 361 }, { "epoch": 0.6362038664323374, "grad_norm": 1.1983934795892268, "learning_rate": 7.982155347848817e-05, "loss": 0.5923, "step": 362 }, { "epoch": 0.6379613356766256, "grad_norm": 1.0065435490586425, "learning_rate": 7.981689196910431e-05, "loss": 0.5862, "step": 363 }, { "epoch": 0.6397188049209139, "grad_norm": 1.190076666649886, "learning_rate": 7.981217049620074e-05, "loss": 0.6133, "step": 364 }, { "epoch": 0.6414762741652021, "grad_norm": 1.0892447933717468, "learning_rate": 7.980738906688788e-05, "loss": 0.5962, "step": 365 }, { "epoch": 0.6432337434094904, "grad_norm": 1.5543509039555978, "learning_rate": 7.98025476883665e-05, "loss": 0.598, "step": 366 }, { "epoch": 0.6449912126537786, "grad_norm": 0.7420905478082135, "learning_rate": 7.979764636792761e-05, "loss": 0.5914, "step": 367 }, { "epoch": 0.6467486818980668, "grad_norm": 0.9776546157788415, "learning_rate": 7.979268511295252e-05, "loss": 0.5939, "step": 368 }, { "epoch": 0.648506151142355, "grad_norm": 0.9406621299867856, "learning_rate": 7.978766393091278e-05, "loss": 0.5849, "step": 369 }, { "epoch": 0.6502636203866432, "grad_norm": 1.5995910868496745, "learning_rate": 7.978258282937022e-05, "loss": 0.585, "step": 370 }, { "epoch": 0.6520210896309314, "grad_norm": 0.6476813087147838, "learning_rate": 7.977744181597688e-05, "loss": 0.5949, "step": 371 }, { "epoch": 0.6537785588752196, "grad_norm": 1.108226231590569, "learning_rate": 7.977224089847502e-05, "loss": 0.6038, "step": 372 }, { "epoch": 0.655536028119508, "grad_norm": 1.6549661542625047, "learning_rate": 7.976698008469714e-05, "loss": 0.6117, "step": 373 }, { "epoch": 0.6572934973637962, "grad_norm": 0.5752379427755251, "learning_rate": 7.976165938256591e-05, "loss": 0.5881, "step": 374 }, { "epoch": 0.6590509666080844, "grad_norm": 1.4705324129969422, "learning_rate": 7.975627880009426e-05, "loss": 0.5845, "step": 375 }, { "epoch": 0.6608084358523726, "grad_norm": 0.9365657313678629, "learning_rate": 7.975083834538519e-05, "loss": 0.6039, "step": 376 }, { "epoch": 0.6625659050966608, "grad_norm": 1.1318601018661811, "learning_rate": 7.974533802663196e-05, "loss": 0.5936, "step": 377 }, { "epoch": 0.664323374340949, "grad_norm": 0.7785104139673413, "learning_rate": 7.973977785211794e-05, "loss": 0.5923, "step": 378 }, { "epoch": 0.6660808435852372, "grad_norm": 0.8992066734660278, "learning_rate": 7.973415783021668e-05, "loss": 0.5888, "step": 379 }, { "epoch": 0.6678383128295254, "grad_norm": 1.1300641165177292, "learning_rate": 7.972847796939179e-05, "loss": 0.5914, "step": 380 }, { "epoch": 0.6695957820738138, "grad_norm": 0.9781815570318672, "learning_rate": 7.972273827819706e-05, "loss": 0.5991, "step": 381 }, { "epoch": 0.671353251318102, "grad_norm": 1.5471171176656773, "learning_rate": 7.971693876527635e-05, "loss": 0.6049, "step": 382 }, { "epoch": 0.6731107205623902, "grad_norm": 1.060314131844292, "learning_rate": 7.971107943936364e-05, "loss": 0.5931, "step": 383 }, { "epoch": 0.6748681898066784, "grad_norm": 1.2411022588678755, "learning_rate": 7.970516030928296e-05, "loss": 0.5813, "step": 384 }, { "epoch": 0.6766256590509666, "grad_norm": 0.8480325859175541, "learning_rate": 7.969918138394841e-05, "loss": 0.5939, "step": 385 }, { "epoch": 0.6783831282952548, "grad_norm": 1.0606326265442998, "learning_rate": 7.969314267236414e-05, "loss": 0.5799, "step": 386 }, { "epoch": 0.680140597539543, "grad_norm": 0.8835475860295473, "learning_rate": 7.968704418362434e-05, "loss": 0.5869, "step": 387 }, { "epoch": 0.6818980667838312, "grad_norm": 1.3853825388394454, "learning_rate": 7.968088592691325e-05, "loss": 0.6109, "step": 388 }, { "epoch": 0.6836555360281195, "grad_norm": 1.0691636971360332, "learning_rate": 7.967466791150504e-05, "loss": 0.5832, "step": 389 }, { "epoch": 0.6854130052724078, "grad_norm": 1.103355068079978, "learning_rate": 7.966839014676399e-05, "loss": 0.5989, "step": 390 }, { "epoch": 0.687170474516696, "grad_norm": 1.2249828951672377, "learning_rate": 7.966205264214426e-05, "loss": 0.591, "step": 391 }, { "epoch": 0.6889279437609842, "grad_norm": 0.9748864393247428, "learning_rate": 7.965565540719002e-05, "loss": 0.579, "step": 392 }, { "epoch": 0.6906854130052724, "grad_norm": 1.0957927828072558, "learning_rate": 7.964919845153542e-05, "loss": 0.6099, "step": 393 }, { "epoch": 0.6924428822495606, "grad_norm": 1.2806858867176882, "learning_rate": 7.964268178490449e-05, "loss": 0.5931, "step": 394 }, { "epoch": 0.6942003514938488, "grad_norm": 0.8564545146869288, "learning_rate": 7.96361054171112e-05, "loss": 0.5904, "step": 395 }, { "epoch": 0.6959578207381371, "grad_norm": 1.0108705322283864, "learning_rate": 7.962946935805949e-05, "loss": 0.6032, "step": 396 }, { "epoch": 0.6977152899824253, "grad_norm": 1.2320825525698176, "learning_rate": 7.962277361774309e-05, "loss": 0.587, "step": 397 }, { "epoch": 0.6994727592267135, "grad_norm": 1.0049933635301735, "learning_rate": 7.96160182062457e-05, "loss": 0.5836, "step": 398 }, { "epoch": 0.7012302284710018, "grad_norm": 1.2470558878381448, "learning_rate": 7.960920313374084e-05, "loss": 0.6072, "step": 399 }, { "epoch": 0.70298769771529, "grad_norm": 0.9619865022311701, "learning_rate": 7.960232841049189e-05, "loss": 0.5881, "step": 400 }, { "epoch": 0.7047451669595782, "grad_norm": 1.0961515046437675, "learning_rate": 7.959539404685205e-05, "loss": 0.6008, "step": 401 }, { "epoch": 0.7065026362038664, "grad_norm": 1.1008021807464539, "learning_rate": 7.958840005326433e-05, "loss": 0.5991, "step": 402 }, { "epoch": 0.7082601054481547, "grad_norm": 1.4194115839071668, "learning_rate": 7.95813464402616e-05, "loss": 0.5782, "step": 403 }, { "epoch": 0.7100175746924429, "grad_norm": 0.622629594698381, "learning_rate": 7.957423321846645e-05, "loss": 0.5844, "step": 404 }, { "epoch": 0.7117750439367311, "grad_norm": 0.8611693028629404, "learning_rate": 7.956706039859124e-05, "loss": 0.5811, "step": 405 }, { "epoch": 0.7135325131810193, "grad_norm": 1.467138973629609, "learning_rate": 7.955982799143815e-05, "loss": 0.585, "step": 406 }, { "epoch": 0.7152899824253075, "grad_norm": 0.7216387114481122, "learning_rate": 7.955253600789902e-05, "loss": 0.5815, "step": 407 }, { "epoch": 0.7170474516695958, "grad_norm": 0.9412494301143117, "learning_rate": 7.954518445895549e-05, "loss": 0.6022, "step": 408 }, { "epoch": 0.718804920913884, "grad_norm": 0.98585615520726, "learning_rate": 7.95377733556788e-05, "loss": 0.5858, "step": 409 }, { "epoch": 0.7205623901581723, "grad_norm": 1.0009573014434388, "learning_rate": 7.953030270922999e-05, "loss": 0.5905, "step": 410 }, { "epoch": 0.7223198594024605, "grad_norm": 1.2948723891603537, "learning_rate": 7.952277253085968e-05, "loss": 0.593, "step": 411 }, { "epoch": 0.7240773286467487, "grad_norm": 1.0091667357069782, "learning_rate": 7.951518283190821e-05, "loss": 0.5801, "step": 412 }, { "epoch": 0.7258347978910369, "grad_norm": 1.2338519747011458, "learning_rate": 7.950753362380551e-05, "loss": 0.584, "step": 413 }, { "epoch": 0.7275922671353251, "grad_norm": 0.8404325175743319, "learning_rate": 7.949982491807117e-05, "loss": 0.5814, "step": 414 }, { "epoch": 0.7293497363796133, "grad_norm": 0.882136279902755, "learning_rate": 7.949205672631435e-05, "loss": 0.574, "step": 415 }, { "epoch": 0.7311072056239016, "grad_norm": 1.2153126129400027, "learning_rate": 7.948422906023378e-05, "loss": 0.5724, "step": 416 }, { "epoch": 0.7328646748681898, "grad_norm": 0.8518300609632989, "learning_rate": 7.947634193161784e-05, "loss": 0.5724, "step": 417 }, { "epoch": 0.7346221441124781, "grad_norm": 0.7314734238861504, "learning_rate": 7.946839535234436e-05, "loss": 0.5815, "step": 418 }, { "epoch": 0.7363796133567663, "grad_norm": 0.9450499676308454, "learning_rate": 7.946038933438076e-05, "loss": 0.5822, "step": 419 }, { "epoch": 0.7381370826010545, "grad_norm": 0.8564145752639226, "learning_rate": 7.945232388978395e-05, "loss": 0.5889, "step": 420 }, { "epoch": 0.7398945518453427, "grad_norm": 1.2385640494898753, "learning_rate": 7.944419903070035e-05, "loss": 0.5925, "step": 421 }, { "epoch": 0.7416520210896309, "grad_norm": 1.1556823459099599, "learning_rate": 7.943601476936585e-05, "loss": 0.5955, "step": 422 }, { "epoch": 0.7434094903339191, "grad_norm": 0.9221514603137378, "learning_rate": 7.942777111810581e-05, "loss": 0.58, "step": 423 }, { "epoch": 0.7451669595782073, "grad_norm": 1.222151672360337, "learning_rate": 7.941946808933501e-05, "loss": 0.5834, "step": 424 }, { "epoch": 0.7469244288224957, "grad_norm": 1.0030241813001006, "learning_rate": 7.941110569555766e-05, "loss": 0.5754, "step": 425 }, { "epoch": 0.7486818980667839, "grad_norm": 1.0027075024163319, "learning_rate": 7.940268394936737e-05, "loss": 0.5897, "step": 426 }, { "epoch": 0.7504393673110721, "grad_norm": 1.0269126969820612, "learning_rate": 7.939420286344714e-05, "loss": 0.5764, "step": 427 }, { "epoch": 0.7521968365553603, "grad_norm": 0.887384817455975, "learning_rate": 7.938566245056933e-05, "loss": 0.5982, "step": 428 }, { "epoch": 0.7539543057996485, "grad_norm": 1.05374613030807, "learning_rate": 7.937706272359567e-05, "loss": 0.5953, "step": 429 }, { "epoch": 0.7557117750439367, "grad_norm": 1.3638524551652944, "learning_rate": 7.936840369547717e-05, "loss": 0.5717, "step": 430 }, { "epoch": 0.7574692442882249, "grad_norm": 0.7032253317880081, "learning_rate": 7.935968537925417e-05, "loss": 0.5699, "step": 431 }, { "epoch": 0.7592267135325131, "grad_norm": 0.7185238345825404, "learning_rate": 7.93509077880563e-05, "loss": 0.5862, "step": 432 }, { "epoch": 0.7609841827768014, "grad_norm": 0.8600902186367494, "learning_rate": 7.934207093510246e-05, "loss": 0.5868, "step": 433 }, { "epoch": 0.7627416520210897, "grad_norm": 1.2942439482705042, "learning_rate": 7.933317483370079e-05, "loss": 0.5785, "step": 434 }, { "epoch": 0.7644991212653779, "grad_norm": 0.9983171685900692, "learning_rate": 7.932421949724867e-05, "loss": 0.6038, "step": 435 }, { "epoch": 0.7662565905096661, "grad_norm": 1.1402226882222044, "learning_rate": 7.931520493923263e-05, "loss": 0.577, "step": 436 }, { "epoch": 0.7680140597539543, "grad_norm": 1.02135544351546, "learning_rate": 7.930613117322848e-05, "loss": 0.5731, "step": 437 }, { "epoch": 0.7697715289982425, "grad_norm": 1.2563904343562338, "learning_rate": 7.929699821290111e-05, "loss": 0.5868, "step": 438 }, { "epoch": 0.7715289982425307, "grad_norm": 0.8168256537257951, "learning_rate": 7.928780607200462e-05, "loss": 0.5837, "step": 439 }, { "epoch": 0.773286467486819, "grad_norm": 0.6678107481048496, "learning_rate": 7.927855476438221e-05, "loss": 0.5865, "step": 440 }, { "epoch": 0.7750439367311072, "grad_norm": 0.8822559803451516, "learning_rate": 7.926924430396618e-05, "loss": 0.5877, "step": 441 }, { "epoch": 0.7768014059753954, "grad_norm": 1.0732252527525405, "learning_rate": 7.925987470477788e-05, "loss": 0.5895, "step": 442 }, { "epoch": 0.7785588752196837, "grad_norm": 1.012005826014819, "learning_rate": 7.925044598092781e-05, "loss": 0.5826, "step": 443 }, { "epoch": 0.7803163444639719, "grad_norm": 1.1378286902255472, "learning_rate": 7.924095814661541e-05, "loss": 0.5798, "step": 444 }, { "epoch": 0.7820738137082601, "grad_norm": 1.0831601246816183, "learning_rate": 7.923141121612922e-05, "loss": 0.5938, "step": 445 }, { "epoch": 0.7838312829525483, "grad_norm": 0.9412322521907137, "learning_rate": 7.922180520384673e-05, "loss": 0.5862, "step": 446 }, { "epoch": 0.7855887521968365, "grad_norm": 0.816996046099294, "learning_rate": 7.921214012423443e-05, "loss": 0.5707, "step": 447 }, { "epoch": 0.7873462214411248, "grad_norm": 1.0033807276379363, "learning_rate": 7.920241599184776e-05, "loss": 0.6059, "step": 448 }, { "epoch": 0.789103690685413, "grad_norm": 1.1853769887414487, "learning_rate": 7.919263282133106e-05, "loss": 0.5913, "step": 449 }, { "epoch": 0.7908611599297012, "grad_norm": 0.8992893991730242, "learning_rate": 7.918279062741762e-05, "loss": 0.5661, "step": 450 }, { "epoch": 0.7926186291739895, "grad_norm": 1.0094843398982878, "learning_rate": 7.917288942492964e-05, "loss": 0.579, "step": 451 }, { "epoch": 0.7943760984182777, "grad_norm": 0.9795242377581999, "learning_rate": 7.91629292287781e-05, "loss": 0.5722, "step": 452 }, { "epoch": 0.7961335676625659, "grad_norm": 1.042618773087428, "learning_rate": 7.91529100539629e-05, "loss": 0.5907, "step": 453 }, { "epoch": 0.7978910369068541, "grad_norm": 0.9462520274380485, "learning_rate": 7.914283191557273e-05, "loss": 0.5824, "step": 454 }, { "epoch": 0.7996485061511424, "grad_norm": 1.0527242651313253, "learning_rate": 7.913269482878511e-05, "loss": 0.5833, "step": 455 }, { "epoch": 0.8014059753954306, "grad_norm": 1.1992046674698915, "learning_rate": 7.912249880886627e-05, "loss": 0.5725, "step": 456 }, { "epoch": 0.8031634446397188, "grad_norm": 0.7286598695951295, "learning_rate": 7.911224387117127e-05, "loss": 0.5924, "step": 457 }, { "epoch": 0.804920913884007, "grad_norm": 0.8366654652944925, "learning_rate": 7.910193003114384e-05, "loss": 0.5726, "step": 458 }, { "epoch": 0.8066783831282952, "grad_norm": 0.7760461929862912, "learning_rate": 7.909155730431642e-05, "loss": 0.5803, "step": 459 }, { "epoch": 0.8084358523725835, "grad_norm": 0.9011897795945965, "learning_rate": 7.908112570631017e-05, "loss": 0.591, "step": 460 }, { "epoch": 0.8101933216168717, "grad_norm": 1.213855339514183, "learning_rate": 7.90706352528349e-05, "loss": 0.5844, "step": 461 }, { "epoch": 0.81195079086116, "grad_norm": 0.8785871783651735, "learning_rate": 7.906008595968904e-05, "loss": 0.5743, "step": 462 }, { "epoch": 0.8137082601054482, "grad_norm": 0.7341766226925444, "learning_rate": 7.904947784275959e-05, "loss": 0.5805, "step": 463 }, { "epoch": 0.8154657293497364, "grad_norm": 0.5901508625817597, "learning_rate": 7.903881091802224e-05, "loss": 0.5969, "step": 464 }, { "epoch": 0.8172231985940246, "grad_norm": 0.6179613072286362, "learning_rate": 7.902808520154115e-05, "loss": 0.5792, "step": 465 }, { "epoch": 0.8189806678383128, "grad_norm": 0.708394905719722, "learning_rate": 7.901730070946906e-05, "loss": 0.5731, "step": 466 }, { "epoch": 0.820738137082601, "grad_norm": 0.9881477502176157, "learning_rate": 7.900645745804724e-05, "loss": 0.5865, "step": 467 }, { "epoch": 0.8224956063268892, "grad_norm": 1.1380451422320528, "learning_rate": 7.899555546360542e-05, "loss": 0.5918, "step": 468 }, { "epoch": 0.8242530755711776, "grad_norm": 0.8170358685394246, "learning_rate": 7.89845947425618e-05, "loss": 0.5858, "step": 469 }, { "epoch": 0.8260105448154658, "grad_norm": 1.161135283512483, "learning_rate": 7.897357531142303e-05, "loss": 0.5787, "step": 470 }, { "epoch": 0.827768014059754, "grad_norm": 1.3961290033437996, "learning_rate": 7.89624971867842e-05, "loss": 0.5781, "step": 471 }, { "epoch": 0.8295254833040422, "grad_norm": 0.7274440653753689, "learning_rate": 7.895136038532871e-05, "loss": 0.5806, "step": 472 }, { "epoch": 0.8312829525483304, "grad_norm": 0.9558313764477302, "learning_rate": 7.894016492382843e-05, "loss": 0.5951, "step": 473 }, { "epoch": 0.8330404217926186, "grad_norm": 1.2894075126340427, "learning_rate": 7.89289108191435e-05, "loss": 0.5911, "step": 474 }, { "epoch": 0.8347978910369068, "grad_norm": 1.1160599671554639, "learning_rate": 7.891759808822241e-05, "loss": 0.5925, "step": 475 }, { "epoch": 0.836555360281195, "grad_norm": 0.9857142642995558, "learning_rate": 7.890622674810192e-05, "loss": 0.5748, "step": 476 }, { "epoch": 0.8383128295254832, "grad_norm": 0.8131455301776392, "learning_rate": 7.889479681590707e-05, "loss": 0.578, "step": 477 }, { "epoch": 0.8400702987697716, "grad_norm": 0.9587748694295426, "learning_rate": 7.888330830885113e-05, "loss": 0.5738, "step": 478 }, { "epoch": 0.8418277680140598, "grad_norm": 1.1295305172712473, "learning_rate": 7.887176124423557e-05, "loss": 0.5657, "step": 479 }, { "epoch": 0.843585237258348, "grad_norm": 1.0007209045239542, "learning_rate": 7.886015563945007e-05, "loss": 0.5846, "step": 480 }, { "epoch": 0.8453427065026362, "grad_norm": 1.076005965262866, "learning_rate": 7.884849151197248e-05, "loss": 0.585, "step": 481 }, { "epoch": 0.8471001757469244, "grad_norm": 0.833451935648409, "learning_rate": 7.883676887936873e-05, "loss": 0.5698, "step": 482 }, { "epoch": 0.8488576449912126, "grad_norm": 0.9340587068836048, "learning_rate": 7.882498775929293e-05, "loss": 0.5813, "step": 483 }, { "epoch": 0.8506151142355008, "grad_norm": 0.9472210448164372, "learning_rate": 7.881314816948721e-05, "loss": 0.5827, "step": 484 }, { "epoch": 0.8523725834797891, "grad_norm": 1.0540178263793332, "learning_rate": 7.880125012778177e-05, "loss": 0.5634, "step": 485 }, { "epoch": 0.8541300527240774, "grad_norm": 1.0248531445832618, "learning_rate": 7.878929365209486e-05, "loss": 0.5826, "step": 486 }, { "epoch": 0.8558875219683656, "grad_norm": 0.9373741622984061, "learning_rate": 7.877727876043273e-05, "loss": 0.5614, "step": 487 }, { "epoch": 0.8576449912126538, "grad_norm": 0.8411750972765034, "learning_rate": 7.876520547088955e-05, "loss": 0.5759, "step": 488 }, { "epoch": 0.859402460456942, "grad_norm": 0.8296479216517884, "learning_rate": 7.875307380164753e-05, "loss": 0.5897, "step": 489 }, { "epoch": 0.8611599297012302, "grad_norm": 0.8710252185294417, "learning_rate": 7.87408837709767e-05, "loss": 0.5929, "step": 490 }, { "epoch": 0.8629173989455184, "grad_norm": 1.007543941008589, "learning_rate": 7.872863539723504e-05, "loss": 0.5712, "step": 491 }, { "epoch": 0.8646748681898067, "grad_norm": 0.9991734955601097, "learning_rate": 7.871632869886839e-05, "loss": 0.5676, "step": 492 }, { "epoch": 0.8664323374340949, "grad_norm": 0.949179028325099, "learning_rate": 7.87039636944104e-05, "loss": 0.5784, "step": 493 }, { "epoch": 0.8681898066783831, "grad_norm": 0.8716155876516686, "learning_rate": 7.869154040248255e-05, "loss": 0.5802, "step": 494 }, { "epoch": 0.8699472759226714, "grad_norm": 0.8331483422516651, "learning_rate": 7.867905884179409e-05, "loss": 0.572, "step": 495 }, { "epoch": 0.8717047451669596, "grad_norm": 0.7459081106876493, "learning_rate": 7.866651903114204e-05, "loss": 0.5759, "step": 496 }, { "epoch": 0.8734622144112478, "grad_norm": 0.9643018258460383, "learning_rate": 7.86539209894111e-05, "loss": 0.5816, "step": 497 }, { "epoch": 0.875219683655536, "grad_norm": 1.08309439830322, "learning_rate": 7.864126473557371e-05, "loss": 0.5833, "step": 498 }, { "epoch": 0.8769771528998243, "grad_norm": 0.7466056627348394, "learning_rate": 7.862855028868996e-05, "loss": 0.5754, "step": 499 }, { "epoch": 0.8787346221441125, "grad_norm": 0.6699192001248018, "learning_rate": 7.861577766790759e-05, "loss": 0.5651, "step": 500 }, { "epoch": 0.8804920913884007, "grad_norm": 0.6186435350078151, "learning_rate": 7.86029468924619e-05, "loss": 0.57, "step": 501 }, { "epoch": 0.8822495606326889, "grad_norm": 0.5638019453082576, "learning_rate": 7.859005798167583e-05, "loss": 0.5752, "step": 502 }, { "epoch": 0.8840070298769771, "grad_norm": 0.6937625124662558, "learning_rate": 7.857711095495986e-05, "loss": 0.5605, "step": 503 }, { "epoch": 0.8857644991212654, "grad_norm": 0.9461181594164347, "learning_rate": 7.856410583181194e-05, "loss": 0.5993, "step": 504 }, { "epoch": 0.8875219683655536, "grad_norm": 0.9823969547013909, "learning_rate": 7.85510426318176e-05, "loss": 0.565, "step": 505 }, { "epoch": 0.8892794376098418, "grad_norm": 1.0114740799238333, "learning_rate": 7.853792137464975e-05, "loss": 0.5778, "step": 506 }, { "epoch": 0.8910369068541301, "grad_norm": 1.2225833198749858, "learning_rate": 7.852474208006875e-05, "loss": 0.5686, "step": 507 }, { "epoch": 0.8927943760984183, "grad_norm": 0.8866587975578384, "learning_rate": 7.851150476792243e-05, "loss": 0.5781, "step": 508 }, { "epoch": 0.8945518453427065, "grad_norm": 0.8982726438474208, "learning_rate": 7.849820945814589e-05, "loss": 0.5888, "step": 509 }, { "epoch": 0.8963093145869947, "grad_norm": 0.9409396355590688, "learning_rate": 7.848485617076166e-05, "loss": 0.5622, "step": 510 }, { "epoch": 0.8980667838312829, "grad_norm": 1.306866200866147, "learning_rate": 7.847144492587955e-05, "loss": 0.5961, "step": 511 }, { "epoch": 0.8998242530755711, "grad_norm": 0.8465613279967831, "learning_rate": 7.845797574369664e-05, "loss": 0.5826, "step": 512 }, { "epoch": 0.9015817223198594, "grad_norm": 0.7239666183305903, "learning_rate": 7.844444864449728e-05, "loss": 0.5669, "step": 513 }, { "epoch": 0.9033391915641477, "grad_norm": 0.6935759667080602, "learning_rate": 7.843086364865302e-05, "loss": 0.5862, "step": 514 }, { "epoch": 0.9050966608084359, "grad_norm": 1.0368579997410807, "learning_rate": 7.841722077662264e-05, "loss": 0.5774, "step": 515 }, { "epoch": 0.9068541300527241, "grad_norm": 1.2089923731841827, "learning_rate": 7.840352004895205e-05, "loss": 0.5818, "step": 516 }, { "epoch": 0.9086115992970123, "grad_norm": 0.7211598533374914, "learning_rate": 7.83897614862743e-05, "loss": 0.5791, "step": 517 }, { "epoch": 0.9103690685413005, "grad_norm": 1.0397820302800982, "learning_rate": 7.837594510930955e-05, "loss": 0.5902, "step": 518 }, { "epoch": 0.9121265377855887, "grad_norm": 0.9400001921012733, "learning_rate": 7.836207093886498e-05, "loss": 0.5663, "step": 519 }, { "epoch": 0.9138840070298769, "grad_norm": 0.7858966629844719, "learning_rate": 7.834813899583486e-05, "loss": 0.5706, "step": 520 }, { "epoch": 0.9156414762741653, "grad_norm": 0.6406819804404252, "learning_rate": 7.833414930120047e-05, "loss": 0.5578, "step": 521 }, { "epoch": 0.9173989455184535, "grad_norm": 0.49662012232828867, "learning_rate": 7.832010187602999e-05, "loss": 0.5729, "step": 522 }, { "epoch": 0.9191564147627417, "grad_norm": 0.7082285571991453, "learning_rate": 7.830599674147861e-05, "loss": 0.5688, "step": 523 }, { "epoch": 0.9209138840070299, "grad_norm": 0.7536860187879152, "learning_rate": 7.829183391878842e-05, "loss": 0.5711, "step": 524 }, { "epoch": 0.9226713532513181, "grad_norm": 0.6483870351462157, "learning_rate": 7.827761342928836e-05, "loss": 0.5574, "step": 525 }, { "epoch": 0.9244288224956063, "grad_norm": 0.6953355565734405, "learning_rate": 7.826333529439422e-05, "loss": 0.5613, "step": 526 }, { "epoch": 0.9261862917398945, "grad_norm": 0.9194515997724495, "learning_rate": 7.824899953560865e-05, "loss": 0.5602, "step": 527 }, { "epoch": 0.9279437609841827, "grad_norm": 1.3315592220341839, "learning_rate": 7.823460617452102e-05, "loss": 0.5744, "step": 528 }, { "epoch": 0.929701230228471, "grad_norm": 0.823304098648789, "learning_rate": 7.822015523280746e-05, "loss": 0.5675, "step": 529 }, { "epoch": 0.9314586994727593, "grad_norm": 0.6003195481242134, "learning_rate": 7.820564673223084e-05, "loss": 0.5695, "step": 530 }, { "epoch": 0.9332161687170475, "grad_norm": 0.6988887097505538, "learning_rate": 7.819108069464069e-05, "loss": 0.5813, "step": 531 }, { "epoch": 0.9349736379613357, "grad_norm": 0.9803624620232468, "learning_rate": 7.817645714197322e-05, "loss": 0.5867, "step": 532 }, { "epoch": 0.9367311072056239, "grad_norm": 1.2240968847603904, "learning_rate": 7.816177609625123e-05, "loss": 0.573, "step": 533 }, { "epoch": 0.9384885764499121, "grad_norm": 0.6134759973788723, "learning_rate": 7.81470375795841e-05, "loss": 0.5702, "step": 534 }, { "epoch": 0.9402460456942003, "grad_norm": 0.6353570788496529, "learning_rate": 7.813224161416775e-05, "loss": 0.5953, "step": 535 }, { "epoch": 0.9420035149384886, "grad_norm": 1.0181717273277242, "learning_rate": 7.81173882222847e-05, "loss": 0.5581, "step": 536 }, { "epoch": 0.9437609841827768, "grad_norm": 0.9954275064310115, "learning_rate": 7.810247742630382e-05, "loss": 0.5761, "step": 537 }, { "epoch": 0.945518453427065, "grad_norm": 0.8250840564907528, "learning_rate": 7.808750924868054e-05, "loss": 0.5817, "step": 538 }, { "epoch": 0.9472759226713533, "grad_norm": 0.7923052951138019, "learning_rate": 7.807248371195665e-05, "loss": 0.5861, "step": 539 }, { "epoch": 0.9490333919156415, "grad_norm": 1.3442589385217618, "learning_rate": 7.805740083876034e-05, "loss": 0.5659, "step": 540 }, { "epoch": 0.9507908611599297, "grad_norm": 0.8672419053705664, "learning_rate": 7.804226065180615e-05, "loss": 0.5775, "step": 541 }, { "epoch": 0.9525483304042179, "grad_norm": 0.8717685239807329, "learning_rate": 7.802706317389492e-05, "loss": 0.5899, "step": 542 }, { "epoch": 0.9543057996485061, "grad_norm": 0.9798449505287722, "learning_rate": 7.801180842791379e-05, "loss": 0.604, "step": 543 }, { "epoch": 0.9560632688927944, "grad_norm": 1.0165445499147476, "learning_rate": 7.79964964368361e-05, "loss": 0.5797, "step": 544 }, { "epoch": 0.9578207381370826, "grad_norm": 0.942553001449059, "learning_rate": 7.798112722372148e-05, "loss": 0.566, "step": 545 }, { "epoch": 0.9595782073813708, "grad_norm": 0.6890450405150966, "learning_rate": 7.796570081171564e-05, "loss": 0.5836, "step": 546 }, { "epoch": 0.961335676625659, "grad_norm": 0.740410748556896, "learning_rate": 7.79502172240505e-05, "loss": 0.5722, "step": 547 }, { "epoch": 0.9630931458699473, "grad_norm": 0.9489787931043174, "learning_rate": 7.793467648404408e-05, "loss": 0.5748, "step": 548 }, { "epoch": 0.9648506151142355, "grad_norm": 0.79986969984351, "learning_rate": 7.791907861510043e-05, "loss": 0.5906, "step": 549 }, { "epoch": 0.9666080843585237, "grad_norm": 0.7529565634493184, "learning_rate": 7.790342364070965e-05, "loss": 0.5815, "step": 550 }, { "epoch": 0.968365553602812, "grad_norm": 0.8591334558978408, "learning_rate": 7.788771158444787e-05, "loss": 0.557, "step": 551 }, { "epoch": 0.9701230228471002, "grad_norm": 1.2233226411990183, "learning_rate": 7.787194246997717e-05, "loss": 0.5877, "step": 552 }, { "epoch": 0.9718804920913884, "grad_norm": 1.0145770804101029, "learning_rate": 7.785611632104552e-05, "loss": 0.5583, "step": 553 }, { "epoch": 0.9736379613356766, "grad_norm": 0.9986119327857896, "learning_rate": 7.784023316148684e-05, "loss": 0.572, "step": 554 }, { "epoch": 0.9753954305799648, "grad_norm": 0.8507041878503755, "learning_rate": 7.782429301522086e-05, "loss": 0.565, "step": 555 }, { "epoch": 0.9771528998242531, "grad_norm": 0.730671638610257, "learning_rate": 7.780829590625317e-05, "loss": 0.5766, "step": 556 }, { "epoch": 0.9789103690685413, "grad_norm": 0.794201927247504, "learning_rate": 7.779224185867513e-05, "loss": 0.5784, "step": 557 }, { "epoch": 0.9806678383128296, "grad_norm": 0.7568582001189176, "learning_rate": 7.777613089666382e-05, "loss": 0.5715, "step": 558 }, { "epoch": 0.9824253075571178, "grad_norm": 0.7301169396728986, "learning_rate": 7.775996304448209e-05, "loss": 0.5637, "step": 559 }, { "epoch": 0.984182776801406, "grad_norm": 0.9293706499455944, "learning_rate": 7.774373832647842e-05, "loss": 0.5644, "step": 560 }, { "epoch": 0.9859402460456942, "grad_norm": 1.0576136842975858, "learning_rate": 7.772745676708693e-05, "loss": 0.5676, "step": 561 }, { "epoch": 0.9876977152899824, "grad_norm": 1.082267531838569, "learning_rate": 7.771111839082738e-05, "loss": 0.5725, "step": 562 }, { "epoch": 0.9894551845342706, "grad_norm": 0.9619439431853406, "learning_rate": 7.769472322230503e-05, "loss": 0.5763, "step": 563 }, { "epoch": 0.9912126537785588, "grad_norm": 0.9272717158607172, "learning_rate": 7.767827128621076e-05, "loss": 0.5679, "step": 564 }, { "epoch": 0.9929701230228472, "grad_norm": 0.9948719621250162, "learning_rate": 7.766176260732084e-05, "loss": 0.5806, "step": 565 }, { "epoch": 0.9947275922671354, "grad_norm": 0.9847182179853623, "learning_rate": 7.764519721049706e-05, "loss": 0.5658, "step": 566 }, { "epoch": 0.9964850615114236, "grad_norm": 0.8960644190355812, "learning_rate": 7.762857512068663e-05, "loss": 0.5547, "step": 567 }, { "epoch": 0.9982425307557118, "grad_norm": 0.8465805149640533, "learning_rate": 7.761189636292206e-05, "loss": 0.5789, "step": 568 }, { "epoch": 1.0, "grad_norm": 0.7876829733782373, "learning_rate": 7.759516096232133e-05, "loss": 0.5601, "step": 569 }, { "epoch": 1.0017574692442883, "grad_norm": 0.7059093076702422, "learning_rate": 7.75783689440876e-05, "loss": 0.5569, "step": 570 }, { "epoch": 1.0035149384885764, "grad_norm": 0.6830821456277814, "learning_rate": 7.756152033350936e-05, "loss": 0.5554, "step": 571 }, { "epoch": 1.0052724077328647, "grad_norm": 0.8866485578909205, "learning_rate": 7.754461515596034e-05, "loss": 0.5436, "step": 572 }, { "epoch": 1.0070298769771528, "grad_norm": 1.0455026135051642, "learning_rate": 7.752765343689938e-05, "loss": 0.5616, "step": 573 }, { "epoch": 1.0087873462214412, "grad_norm": 1.0477879606347331, "learning_rate": 7.751063520187055e-05, "loss": 0.5665, "step": 574 }, { "epoch": 1.0105448154657293, "grad_norm": 1.0618392065376234, "learning_rate": 7.7493560476503e-05, "loss": 0.5535, "step": 575 }, { "epoch": 1.0123022847100176, "grad_norm": 0.9324648165315635, "learning_rate": 7.747642928651098e-05, "loss": 0.5652, "step": 576 }, { "epoch": 1.0140597539543057, "grad_norm": 0.7486108329961071, "learning_rate": 7.745924165769371e-05, "loss": 0.56, "step": 577 }, { "epoch": 1.015817223198594, "grad_norm": 0.6042401528956576, "learning_rate": 7.744199761593549e-05, "loss": 0.5466, "step": 578 }, { "epoch": 1.0175746924428823, "grad_norm": 0.5247204423157633, "learning_rate": 7.74246971872055e-05, "loss": 0.561, "step": 579 }, { "epoch": 1.0193321616871704, "grad_norm": 0.5246778752209503, "learning_rate": 7.74073403975579e-05, "loss": 0.5696, "step": 580 }, { "epoch": 1.0210896309314588, "grad_norm": 0.5457277605355756, "learning_rate": 7.73899272731317e-05, "loss": 0.5527, "step": 581 }, { "epoch": 1.0228471001757469, "grad_norm": 0.5465329848363112, "learning_rate": 7.737245784015074e-05, "loss": 0.5501, "step": 582 }, { "epoch": 1.0246045694200352, "grad_norm": 0.6382809968512181, "learning_rate": 7.735493212492367e-05, "loss": 0.5471, "step": 583 }, { "epoch": 1.0263620386643233, "grad_norm": 0.8169457659237395, "learning_rate": 7.73373501538439e-05, "loss": 0.5485, "step": 584 }, { "epoch": 1.0281195079086116, "grad_norm": 0.8123525539284098, "learning_rate": 7.731971195338956e-05, "loss": 0.5501, "step": 585 }, { "epoch": 1.0298769771528997, "grad_norm": 0.752617141084679, "learning_rate": 7.730201755012348e-05, "loss": 0.5533, "step": 586 }, { "epoch": 1.031634446397188, "grad_norm": 0.6572276738356119, "learning_rate": 7.728426697069308e-05, "loss": 0.5493, "step": 587 }, { "epoch": 1.0333919156414764, "grad_norm": 0.6388072827407837, "learning_rate": 7.726646024183043e-05, "loss": 0.5551, "step": 588 }, { "epoch": 1.0351493848857645, "grad_norm": 0.7336728051643843, "learning_rate": 7.724859739035214e-05, "loss": 0.553, "step": 589 }, { "epoch": 1.0369068541300528, "grad_norm": 0.7916528919493416, "learning_rate": 7.723067844315935e-05, "loss": 0.5528, "step": 590 }, { "epoch": 1.038664323374341, "grad_norm": 0.9121754482793986, "learning_rate": 7.721270342723767e-05, "loss": 0.557, "step": 591 }, { "epoch": 1.0404217926186292, "grad_norm": 1.1188571472464948, "learning_rate": 7.719467236965712e-05, "loss": 0.5575, "step": 592 }, { "epoch": 1.0421792618629173, "grad_norm": 1.0332297038029896, "learning_rate": 7.717658529757219e-05, "loss": 0.5402, "step": 593 }, { "epoch": 1.0439367311072056, "grad_norm": 0.8861857818024235, "learning_rate": 7.715844223822165e-05, "loss": 0.5308, "step": 594 }, { "epoch": 1.0456942003514937, "grad_norm": 0.8331356531609116, "learning_rate": 7.714024321892864e-05, "loss": 0.5563, "step": 595 }, { "epoch": 1.047451669595782, "grad_norm": 0.7977160549616331, "learning_rate": 7.712198826710053e-05, "loss": 0.5472, "step": 596 }, { "epoch": 1.0492091388400704, "grad_norm": 0.7944773874980271, "learning_rate": 7.710367741022897e-05, "loss": 0.5552, "step": 597 }, { "epoch": 1.0509666080843585, "grad_norm": 0.8604135474861583, "learning_rate": 7.708531067588977e-05, "loss": 0.5563, "step": 598 }, { "epoch": 1.0527240773286468, "grad_norm": 0.8830070855596014, "learning_rate": 7.706688809174291e-05, "loss": 0.5568, "step": 599 }, { "epoch": 1.054481546572935, "grad_norm": 0.7588216506792758, "learning_rate": 7.704840968553246e-05, "loss": 0.544, "step": 600 }, { "epoch": 1.0562390158172232, "grad_norm": 0.5129453228696151, "learning_rate": 7.702987548508656e-05, "loss": 0.5461, "step": 601 }, { "epoch": 1.0579964850615113, "grad_norm": 0.4091023239099563, "learning_rate": 7.70112855183174e-05, "loss": 0.5496, "step": 602 }, { "epoch": 1.0597539543057997, "grad_norm": 0.4504403125587712, "learning_rate": 7.699263981322112e-05, "loss": 0.5606, "step": 603 }, { "epoch": 1.0615114235500878, "grad_norm": 0.5154032829022808, "learning_rate": 7.697393839787782e-05, "loss": 0.5647, "step": 604 }, { "epoch": 1.063268892794376, "grad_norm": 0.6936966550455351, "learning_rate": 7.695518130045147e-05, "loss": 0.542, "step": 605 }, { "epoch": 1.0650263620386644, "grad_norm": 0.9839539777479138, "learning_rate": 7.693636854918997e-05, "loss": 0.5542, "step": 606 }, { "epoch": 1.0667838312829525, "grad_norm": 1.2975973918983685, "learning_rate": 7.691750017242493e-05, "loss": 0.5397, "step": 607 }, { "epoch": 1.0685413005272408, "grad_norm": 0.5905498633134657, "learning_rate": 7.689857619857181e-05, "loss": 0.5576, "step": 608 }, { "epoch": 1.070298769771529, "grad_norm": 0.48685943963160555, "learning_rate": 7.687959665612978e-05, "loss": 0.5694, "step": 609 }, { "epoch": 1.0720562390158173, "grad_norm": 0.8354075388617355, "learning_rate": 7.686056157368169e-05, "loss": 0.5522, "step": 610 }, { "epoch": 1.0738137082601054, "grad_norm": 1.1495669254425478, "learning_rate": 7.684147097989399e-05, "loss": 0.5473, "step": 611 }, { "epoch": 1.0755711775043937, "grad_norm": 0.8906924711668216, "learning_rate": 7.682232490351683e-05, "loss": 0.5416, "step": 612 }, { "epoch": 1.0773286467486818, "grad_norm": 0.7031828233178891, "learning_rate": 7.68031233733838e-05, "loss": 0.5604, "step": 613 }, { "epoch": 1.07908611599297, "grad_norm": 0.5749225291975912, "learning_rate": 7.678386641841209e-05, "loss": 0.5554, "step": 614 }, { "epoch": 1.0808435852372584, "grad_norm": 0.6098219481523796, "learning_rate": 7.676455406760232e-05, "loss": 0.5529, "step": 615 }, { "epoch": 1.0826010544815465, "grad_norm": 0.6977573422749664, "learning_rate": 7.674518635003853e-05, "loss": 0.539, "step": 616 }, { "epoch": 1.0843585237258349, "grad_norm": 0.6949165651771545, "learning_rate": 7.672576329488817e-05, "loss": 0.5566, "step": 617 }, { "epoch": 1.086115992970123, "grad_norm": 0.7303232611845862, "learning_rate": 7.670628493140198e-05, "loss": 0.5551, "step": 618 }, { "epoch": 1.0878734622144113, "grad_norm": 0.7962905843966125, "learning_rate": 7.668675128891407e-05, "loss": 0.5481, "step": 619 }, { "epoch": 1.0896309314586994, "grad_norm": 0.8447388958018496, "learning_rate": 7.66671623968417e-05, "loss": 0.5399, "step": 620 }, { "epoch": 1.0913884007029877, "grad_norm": 0.881261826007946, "learning_rate": 7.664751828468545e-05, "loss": 0.5468, "step": 621 }, { "epoch": 1.0931458699472758, "grad_norm": 1.0214398075369826, "learning_rate": 7.662781898202893e-05, "loss": 0.54, "step": 622 }, { "epoch": 1.0949033391915641, "grad_norm": 1.0840077855465318, "learning_rate": 7.660806451853899e-05, "loss": 0.5464, "step": 623 }, { "epoch": 1.0966608084358525, "grad_norm": 0.8164984810844005, "learning_rate": 7.658825492396546e-05, "loss": 0.556, "step": 624 }, { "epoch": 1.0984182776801406, "grad_norm": 0.5096417811989646, "learning_rate": 7.656839022814124e-05, "loss": 0.553, "step": 625 }, { "epoch": 1.1001757469244289, "grad_norm": 0.6134926559497319, "learning_rate": 7.65484704609822e-05, "loss": 0.5487, "step": 626 }, { "epoch": 1.101933216168717, "grad_norm": 0.923748307012472, "learning_rate": 7.652849565248716e-05, "loss": 0.5576, "step": 627 }, { "epoch": 1.1036906854130053, "grad_norm": 1.068251027955442, "learning_rate": 7.650846583273781e-05, "loss": 0.5581, "step": 628 }, { "epoch": 1.1054481546572934, "grad_norm": 1.0292119515906248, "learning_rate": 7.64883810318987e-05, "loss": 0.5676, "step": 629 }, { "epoch": 1.1072056239015817, "grad_norm": 0.9256728412080364, "learning_rate": 7.64682412802172e-05, "loss": 0.5538, "step": 630 }, { "epoch": 1.10896309314587, "grad_norm": 0.7755948974493021, "learning_rate": 7.644804660802337e-05, "loss": 0.5504, "step": 631 }, { "epoch": 1.1107205623901582, "grad_norm": 0.7142340565291054, "learning_rate": 7.642779704573005e-05, "loss": 0.552, "step": 632 }, { "epoch": 1.1124780316344465, "grad_norm": 0.6991518822524406, "learning_rate": 7.640749262383272e-05, "loss": 0.5476, "step": 633 }, { "epoch": 1.1142355008787346, "grad_norm": 0.7006452044396446, "learning_rate": 7.638713337290945e-05, "loss": 0.5509, "step": 634 }, { "epoch": 1.115992970123023, "grad_norm": 0.5717525209261332, "learning_rate": 7.63667193236209e-05, "loss": 0.5484, "step": 635 }, { "epoch": 1.117750439367311, "grad_norm": 0.45777162997876036, "learning_rate": 7.634625050671028e-05, "loss": 0.5515, "step": 636 }, { "epoch": 1.1195079086115993, "grad_norm": 0.6672108061443839, "learning_rate": 7.632572695300326e-05, "loss": 0.5513, "step": 637 }, { "epoch": 1.1212653778558874, "grad_norm": 0.7697835406218321, "learning_rate": 7.630514869340792e-05, "loss": 0.5379, "step": 638 }, { "epoch": 1.1230228471001757, "grad_norm": 0.7338900634691895, "learning_rate": 7.628451575891477e-05, "loss": 0.5563, "step": 639 }, { "epoch": 1.124780316344464, "grad_norm": 0.8353397415020025, "learning_rate": 7.626382818059662e-05, "loss": 0.5598, "step": 640 }, { "epoch": 1.1265377855887522, "grad_norm": 0.9637827289972132, "learning_rate": 7.624308598960859e-05, "loss": 0.5531, "step": 641 }, { "epoch": 1.1282952548330405, "grad_norm": 0.9878858627038484, "learning_rate": 7.622228921718807e-05, "loss": 0.5514, "step": 642 }, { "epoch": 1.1300527240773286, "grad_norm": 0.9967954505122217, "learning_rate": 7.620143789465458e-05, "loss": 0.5533, "step": 643 }, { "epoch": 1.131810193321617, "grad_norm": 1.0576905435854715, "learning_rate": 7.618053205340987e-05, "loss": 0.5615, "step": 644 }, { "epoch": 1.133567662565905, "grad_norm": 0.7437715605531593, "learning_rate": 7.615957172493774e-05, "loss": 0.5559, "step": 645 }, { "epoch": 1.1353251318101933, "grad_norm": 0.5221614261489316, "learning_rate": 7.613855694080407e-05, "loss": 0.5538, "step": 646 }, { "epoch": 1.1370826010544817, "grad_norm": 0.5325774984277896, "learning_rate": 7.611748773265677e-05, "loss": 0.5401, "step": 647 }, { "epoch": 1.1388400702987698, "grad_norm": 0.6615330716743749, "learning_rate": 7.609636413222564e-05, "loss": 0.5447, "step": 648 }, { "epoch": 1.140597539543058, "grad_norm": 0.7663860455354153, "learning_rate": 7.607518617132246e-05, "loss": 0.5532, "step": 649 }, { "epoch": 1.1423550087873462, "grad_norm": 0.9799109583297303, "learning_rate": 7.605395388184089e-05, "loss": 0.5444, "step": 650 }, { "epoch": 1.1441124780316345, "grad_norm": 1.1775503917673362, "learning_rate": 7.603266729575632e-05, "loss": 0.5502, "step": 651 }, { "epoch": 1.1458699472759226, "grad_norm": 0.6719065828717583, "learning_rate": 7.6011326445126e-05, "loss": 0.5505, "step": 652 }, { "epoch": 1.147627416520211, "grad_norm": 0.5031011953772502, "learning_rate": 7.598993136208887e-05, "loss": 0.5423, "step": 653 }, { "epoch": 1.149384885764499, "grad_norm": 0.8202333990599481, "learning_rate": 7.59684820788655e-05, "loss": 0.5487, "step": 654 }, { "epoch": 1.1511423550087874, "grad_norm": 1.0403968704005075, "learning_rate": 7.594697862775816e-05, "loss": 0.5515, "step": 655 }, { "epoch": 1.1528998242530757, "grad_norm": 0.9666351185341403, "learning_rate": 7.592542104115064e-05, "loss": 0.5526, "step": 656 }, { "epoch": 1.1546572934973638, "grad_norm": 0.8061862431532972, "learning_rate": 7.590380935150828e-05, "loss": 0.5535, "step": 657 }, { "epoch": 1.1564147627416521, "grad_norm": 0.6863550373271262, "learning_rate": 7.58821435913779e-05, "loss": 0.5509, "step": 658 }, { "epoch": 1.1581722319859402, "grad_norm": 0.5714702921167445, "learning_rate": 7.586042379338773e-05, "loss": 0.5541, "step": 659 }, { "epoch": 1.1599297012302285, "grad_norm": 0.6283903313193197, "learning_rate": 7.58386499902474e-05, "loss": 0.5594, "step": 660 }, { "epoch": 1.1616871704745166, "grad_norm": 0.6438824727962175, "learning_rate": 7.581682221474789e-05, "loss": 0.5484, "step": 661 }, { "epoch": 1.163444639718805, "grad_norm": 0.7198655314987372, "learning_rate": 7.579494049976139e-05, "loss": 0.5481, "step": 662 }, { "epoch": 1.165202108963093, "grad_norm": 0.6955599049516895, "learning_rate": 7.577300487824139e-05, "loss": 0.546, "step": 663 }, { "epoch": 1.1669595782073814, "grad_norm": 0.6430014204158956, "learning_rate": 7.575101538322254e-05, "loss": 0.5507, "step": 664 }, { "epoch": 1.1687170474516697, "grad_norm": 0.8357376047931928, "learning_rate": 7.572897204782062e-05, "loss": 0.5487, "step": 665 }, { "epoch": 1.1704745166959578, "grad_norm": 1.0932108274626335, "learning_rate": 7.570687490523249e-05, "loss": 0.5499, "step": 666 }, { "epoch": 1.1722319859402461, "grad_norm": 0.8302883030096349, "learning_rate": 7.568472398873606e-05, "loss": 0.5587, "step": 667 }, { "epoch": 1.1739894551845342, "grad_norm": 0.7229804360428674, "learning_rate": 7.56625193316902e-05, "loss": 0.5384, "step": 668 }, { "epoch": 1.1757469244288226, "grad_norm": 0.8196878857401476, "learning_rate": 7.564026096753472e-05, "loss": 0.5462, "step": 669 }, { "epoch": 1.1775043936731107, "grad_norm": 0.9076891455770392, "learning_rate": 7.561794892979033e-05, "loss": 0.546, "step": 670 }, { "epoch": 1.179261862917399, "grad_norm": 0.9606971247748932, "learning_rate": 7.559558325205853e-05, "loss": 0.5565, "step": 671 }, { "epoch": 1.181019332161687, "grad_norm": 1.058968631621876, "learning_rate": 7.557316396802164e-05, "loss": 0.5443, "step": 672 }, { "epoch": 1.1827768014059754, "grad_norm": 0.9570698951808254, "learning_rate": 7.555069111144271e-05, "loss": 0.5511, "step": 673 }, { "epoch": 1.1845342706502637, "grad_norm": 0.8895624121000857, "learning_rate": 7.552816471616544e-05, "loss": 0.5546, "step": 674 }, { "epoch": 1.1862917398945518, "grad_norm": 0.608908682825176, "learning_rate": 7.550558481611417e-05, "loss": 0.5412, "step": 675 }, { "epoch": 1.1880492091388402, "grad_norm": 0.5503065650603889, "learning_rate": 7.548295144529383e-05, "loss": 0.5589, "step": 676 }, { "epoch": 1.1898066783831283, "grad_norm": 0.618456670610612, "learning_rate": 7.546026463778987e-05, "loss": 0.543, "step": 677 }, { "epoch": 1.1915641476274166, "grad_norm": 0.622953253516586, "learning_rate": 7.54375244277682e-05, "loss": 0.5471, "step": 678 }, { "epoch": 1.1933216168717047, "grad_norm": 0.5350940809222805, "learning_rate": 7.541473084947518e-05, "loss": 0.5535, "step": 679 }, { "epoch": 1.195079086115993, "grad_norm": 0.6500444885051335, "learning_rate": 7.539188393723752e-05, "loss": 0.5558, "step": 680 }, { "epoch": 1.196836555360281, "grad_norm": 0.7395045500399698, "learning_rate": 7.536898372546226e-05, "loss": 0.5477, "step": 681 }, { "epoch": 1.1985940246045694, "grad_norm": 0.784987696529728, "learning_rate": 7.534603024863669e-05, "loss": 0.5538, "step": 682 }, { "epoch": 1.2003514938488578, "grad_norm": 0.8918863272305002, "learning_rate": 7.532302354132835e-05, "loss": 0.5628, "step": 683 }, { "epoch": 1.2021089630931459, "grad_norm": 0.94059950950928, "learning_rate": 7.529996363818493e-05, "loss": 0.5633, "step": 684 }, { "epoch": 1.2038664323374342, "grad_norm": 1.075189591657153, "learning_rate": 7.52768505739342e-05, "loss": 0.553, "step": 685 }, { "epoch": 1.2056239015817223, "grad_norm": 0.9543175780072978, "learning_rate": 7.525368438338405e-05, "loss": 0.5399, "step": 686 }, { "epoch": 1.2073813708260106, "grad_norm": 1.112675021250307, "learning_rate": 7.523046510142232e-05, "loss": 0.5585, "step": 687 }, { "epoch": 1.2091388400702987, "grad_norm": 0.5386678977659727, "learning_rate": 7.520719276301684e-05, "loss": 0.5468, "step": 688 }, { "epoch": 1.210896309314587, "grad_norm": 0.8096769900294886, "learning_rate": 7.518386740321532e-05, "loss": 0.5437, "step": 689 }, { "epoch": 1.2126537785588751, "grad_norm": 1.021383822250984, "learning_rate": 7.516048905714535e-05, "loss": 0.562, "step": 690 }, { "epoch": 1.2144112478031635, "grad_norm": 6.002581325365308, "learning_rate": 7.513705776001427e-05, "loss": 0.663, "step": 691 }, { "epoch": 1.2161687170474518, "grad_norm": 2.5613740292157394, "learning_rate": 7.511357354710922e-05, "loss": 0.6006, "step": 692 }, { "epoch": 1.2179261862917399, "grad_norm": 19.334601060412915, "learning_rate": 7.509003645379697e-05, "loss": 0.77, "step": 693 }, { "epoch": 1.2196836555360282, "grad_norm": 3.4651294415318157, "learning_rate": 7.506644651552398e-05, "loss": 0.6215, "step": 694 }, { "epoch": 1.2214411247803163, "grad_norm": 2.168020508785119, "learning_rate": 7.504280376781627e-05, "loss": 0.6281, "step": 695 }, { "epoch": 1.2231985940246046, "grad_norm": 2.036060807903825, "learning_rate": 7.501910824627938e-05, "loss": 0.6113, "step": 696 }, { "epoch": 1.2249560632688927, "grad_norm": 8.375829528486179, "learning_rate": 7.499535998659833e-05, "loss": 0.7452, "step": 697 }, { "epoch": 1.226713532513181, "grad_norm": 1.5289227278150497, "learning_rate": 7.49715590245376e-05, "loss": 0.6261, "step": 698 }, { "epoch": 1.2284710017574691, "grad_norm": 1.8969188939175117, "learning_rate": 7.494770539594099e-05, "loss": 0.6217, "step": 699 }, { "epoch": 1.2302284710017575, "grad_norm": 1.002846192785254, "learning_rate": 7.49237991367316e-05, "loss": 0.6135, "step": 700 }, { "epoch": 1.2319859402460458, "grad_norm": 5.02752253223691, "learning_rate": 7.48998402829119e-05, "loss": 0.6116, "step": 701 }, { "epoch": 1.233743409490334, "grad_norm": 3.0874634519640622, "learning_rate": 7.487582887056342e-05, "loss": 0.6637, "step": 702 }, { "epoch": 1.2355008787346222, "grad_norm": 1.198375859065789, "learning_rate": 7.485176493584694e-05, "loss": 0.6102, "step": 703 }, { "epoch": 1.2372583479789103, "grad_norm": 12.029892438175702, "learning_rate": 7.482764851500234e-05, "loss": 0.6269, "step": 704 }, { "epoch": 1.2390158172231986, "grad_norm": 3.619850621513468, "learning_rate": 7.480347964434846e-05, "loss": 0.684, "step": 705 }, { "epoch": 1.2407732864674867, "grad_norm": 25.22321982744411, "learning_rate": 7.477925836028322e-05, "loss": 0.6927, "step": 706 }, { "epoch": 1.242530755711775, "grad_norm": 21.79697000361752, "learning_rate": 7.475498469928343e-05, "loss": 0.7328, "step": 707 }, { "epoch": 1.2442882249560632, "grad_norm": 2.025880121499913, "learning_rate": 7.473065869790477e-05, "loss": 0.6854, "step": 708 }, { "epoch": 1.2460456942003515, "grad_norm": 1866.7110611903315, "learning_rate": 7.470628039278177e-05, "loss": 1.0741, "step": 709 }, { "epoch": 1.2478031634446398, "grad_norm": 2.303903418488874, "learning_rate": 7.468184982062771e-05, "loss": 0.7343, "step": 710 }, { "epoch": 1.249560632688928, "grad_norm": 4.2795503922292335, "learning_rate": 7.46573670182346e-05, "loss": 0.7132, "step": 711 }, { "epoch": 1.2513181019332162, "grad_norm": 1.847384860607174, "learning_rate": 7.46328320224731e-05, "loss": 0.6835, "step": 712 }, { "epoch": 1.2530755711775043, "grad_norm": 7.760485808300258, "learning_rate": 7.460824487029246e-05, "loss": 0.7462, "step": 713 }, { "epoch": 1.2548330404217927, "grad_norm": 2.904145303078465, "learning_rate": 7.458360559872048e-05, "loss": 0.7122, "step": 714 }, { "epoch": 1.2565905096660808, "grad_norm": 23.80338469920851, "learning_rate": 7.455891424486348e-05, "loss": 0.7815, "step": 715 }, { "epoch": 1.258347978910369, "grad_norm": 2.8846082759252876, "learning_rate": 7.453417084590616e-05, "loss": 0.7118, "step": 716 }, { "epoch": 1.2601054481546572, "grad_norm": 1.2048616948047342, "learning_rate": 7.450937543911169e-05, "loss": 0.6591, "step": 717 }, { "epoch": 1.2618629173989455, "grad_norm": 1.5806828116404865, "learning_rate": 7.448452806182143e-05, "loss": 0.6284, "step": 718 }, { "epoch": 1.2636203866432338, "grad_norm": 1.2096846484961472, "learning_rate": 7.445962875145514e-05, "loss": 0.6304, "step": 719 }, { "epoch": 1.265377855887522, "grad_norm": 1.0610373639593011, "learning_rate": 7.443467754551069e-05, "loss": 0.6129, "step": 720 }, { "epoch": 1.2671353251318103, "grad_norm": 6.906240517964775, "learning_rate": 7.440967448156419e-05, "loss": 0.647, "step": 721 }, { "epoch": 1.2688927943760984, "grad_norm": 1.313889781163848, "learning_rate": 7.438461959726976e-05, "loss": 0.6118, "step": 722 }, { "epoch": 1.2706502636203867, "grad_norm": 0.9835859433881419, "learning_rate": 7.435951293035961e-05, "loss": 0.618, "step": 723 }, { "epoch": 1.2724077328646748, "grad_norm": 1.3841908569169608, "learning_rate": 7.433435451864397e-05, "loss": 0.6155, "step": 724 }, { "epoch": 1.2741652021089631, "grad_norm": 0.9704882548660716, "learning_rate": 7.430914440001089e-05, "loss": 0.6187, "step": 725 }, { "epoch": 1.2759226713532512, "grad_norm": 1.2143367380607788, "learning_rate": 7.428388261242639e-05, "loss": 0.6013, "step": 726 }, { "epoch": 1.2776801405975395, "grad_norm": 0.7496436969199954, "learning_rate": 7.425856919393426e-05, "loss": 0.5925, "step": 727 }, { "epoch": 1.2794376098418279, "grad_norm": 0.84005253766964, "learning_rate": 7.423320418265606e-05, "loss": 0.5737, "step": 728 }, { "epoch": 1.281195079086116, "grad_norm": 0.8972139335458386, "learning_rate": 7.420778761679102e-05, "loss": 0.5795, "step": 729 }, { "epoch": 1.2829525483304043, "grad_norm": 0.7615210585510493, "learning_rate": 7.418231953461603e-05, "loss": 0.5927, "step": 730 }, { "epoch": 1.2847100175746924, "grad_norm": 0.6397457926608884, "learning_rate": 7.415679997448557e-05, "loss": 0.5782, "step": 731 }, { "epoch": 1.2864674868189807, "grad_norm": 0.7434174219365695, "learning_rate": 7.413122897483163e-05, "loss": 0.5777, "step": 732 }, { "epoch": 1.2882249560632688, "grad_norm": 0.7833820965976774, "learning_rate": 7.41056065741637e-05, "loss": 0.5851, "step": 733 }, { "epoch": 1.2899824253075571, "grad_norm": 0.7942055082850622, "learning_rate": 7.407993281106862e-05, "loss": 0.5767, "step": 734 }, { "epoch": 1.2917398945518452, "grad_norm": 0.5180380757298683, "learning_rate": 7.405420772421061e-05, "loss": 0.5718, "step": 735 }, { "epoch": 1.2934973637961336, "grad_norm": 0.69210691469269, "learning_rate": 7.402843135233122e-05, "loss": 0.5739, "step": 736 }, { "epoch": 1.2952548330404219, "grad_norm": 1.003849064597505, "learning_rate": 7.400260373424916e-05, "loss": 0.5691, "step": 737 }, { "epoch": 1.29701230228471, "grad_norm": 20.969654098370512, "learning_rate": 7.397672490886038e-05, "loss": 0.6421, "step": 738 }, { "epoch": 1.2987697715289983, "grad_norm": 3.076903920313403, "learning_rate": 7.395079491513793e-05, "loss": 0.6285, "step": 739 }, { "epoch": 1.3005272407732864, "grad_norm": 1.1720627476053382, "learning_rate": 7.39248137921319e-05, "loss": 0.6051, "step": 740 }, { "epoch": 1.3022847100175747, "grad_norm": 2.301063999863202, "learning_rate": 7.389878157896938e-05, "loss": 0.6138, "step": 741 }, { "epoch": 1.304042179261863, "grad_norm": 2.035873822579502, "learning_rate": 7.387269831485444e-05, "loss": 0.6081, "step": 742 }, { "epoch": 1.3057996485061512, "grad_norm": 1.1370616333081498, "learning_rate": 7.384656403906799e-05, "loss": 0.5809, "step": 743 }, { "epoch": 1.3075571177504393, "grad_norm": 0.9983696460902882, "learning_rate": 7.382037879096777e-05, "loss": 0.5896, "step": 744 }, { "epoch": 1.3093145869947276, "grad_norm": 0.9059072472942802, "learning_rate": 7.379414260998829e-05, "loss": 0.5847, "step": 745 }, { "epoch": 1.311072056239016, "grad_norm": 0.6898299028203713, "learning_rate": 7.376785553564077e-05, "loss": 0.5887, "step": 746 }, { "epoch": 1.312829525483304, "grad_norm": 0.7166024798142578, "learning_rate": 7.37415176075131e-05, "loss": 0.5722, "step": 747 }, { "epoch": 1.3145869947275923, "grad_norm": 0.8025469477480448, "learning_rate": 7.371512886526966e-05, "loss": 0.5641, "step": 748 }, { "epoch": 1.3163444639718804, "grad_norm": 0.6069312829823014, "learning_rate": 7.368868934865146e-05, "loss": 0.573, "step": 749 }, { "epoch": 1.3181019332161688, "grad_norm": 0.677410722395812, "learning_rate": 7.366219909747595e-05, "loss": 0.5655, "step": 750 }, { "epoch": 1.319859402460457, "grad_norm": 0.5706230869506272, "learning_rate": 7.363565815163692e-05, "loss": 0.5692, "step": 751 }, { "epoch": 1.3216168717047452, "grad_norm": 0.5360365357551748, "learning_rate": 7.36090665511046e-05, "loss": 0.5746, "step": 752 }, { "epoch": 1.3233743409490333, "grad_norm": 1.0318417682655034, "learning_rate": 7.358242433592543e-05, "loss": 0.5809, "step": 753 }, { "epoch": 1.3251318101933216, "grad_norm": 0.49778296194343685, "learning_rate": 7.355573154622213e-05, "loss": 0.5661, "step": 754 }, { "epoch": 1.32688927943761, "grad_norm": 0.5685616494332772, "learning_rate": 7.352898822219352e-05, "loss": 0.5689, "step": 755 }, { "epoch": 1.328646748681898, "grad_norm": 0.4821261459879989, "learning_rate": 7.350219440411462e-05, "loss": 0.5605, "step": 756 }, { "epoch": 1.3304042179261863, "grad_norm": 0.5515838686054965, "learning_rate": 7.347535013233637e-05, "loss": 0.556, "step": 757 }, { "epoch": 1.3321616871704745, "grad_norm": 0.47170410417457326, "learning_rate": 7.344845544728582e-05, "loss": 0.5661, "step": 758 }, { "epoch": 1.3339191564147628, "grad_norm": 0.47325145609421454, "learning_rate": 7.342151038946584e-05, "loss": 0.5612, "step": 759 }, { "epoch": 1.335676625659051, "grad_norm": 0.46630342892455334, "learning_rate": 7.33945149994552e-05, "loss": 0.5688, "step": 760 }, { "epoch": 1.3374340949033392, "grad_norm": 0.5550335360878358, "learning_rate": 7.33674693179085e-05, "loss": 0.5571, "step": 761 }, { "epoch": 1.3391915641476273, "grad_norm": 0.45563826323397005, "learning_rate": 7.334037338555602e-05, "loss": 0.5681, "step": 762 }, { "epoch": 1.3409490333919156, "grad_norm": 0.4456357311804823, "learning_rate": 7.331322724320375e-05, "loss": 0.5625, "step": 763 }, { "epoch": 1.342706502636204, "grad_norm": 0.4003831843128565, "learning_rate": 7.32860309317333e-05, "loss": 0.5588, "step": 764 }, { "epoch": 1.344463971880492, "grad_norm": 0.40270219690970305, "learning_rate": 7.325878449210182e-05, "loss": 0.573, "step": 765 }, { "epoch": 1.3462214411247804, "grad_norm": 0.48691118152773133, "learning_rate": 7.323148796534194e-05, "loss": 0.565, "step": 766 }, { "epoch": 1.3479789103690685, "grad_norm": 0.5504256432299914, "learning_rate": 7.320414139256176e-05, "loss": 0.5513, "step": 767 }, { "epoch": 1.3497363796133568, "grad_norm": 0.536228345105712, "learning_rate": 7.31767448149447e-05, "loss": 0.5586, "step": 768 }, { "epoch": 1.3514938488576451, "grad_norm": 0.6806548307287961, "learning_rate": 7.314929827374953e-05, "loss": 0.561, "step": 769 }, { "epoch": 1.3532513181019332, "grad_norm": 0.7750313379484527, "learning_rate": 7.312180181031024e-05, "loss": 0.5589, "step": 770 }, { "epoch": 1.3550087873462213, "grad_norm": 1.451113660121192, "learning_rate": 7.3094255466036e-05, "loss": 0.5838, "step": 771 }, { "epoch": 1.3567662565905096, "grad_norm": 0.47084999446718967, "learning_rate": 7.306665928241112e-05, "loss": 0.5615, "step": 772 }, { "epoch": 1.358523725834798, "grad_norm": 0.6661376362598277, "learning_rate": 7.303901330099493e-05, "loss": 0.5584, "step": 773 }, { "epoch": 1.360281195079086, "grad_norm": 0.8192107759730654, "learning_rate": 7.30113175634218e-05, "loss": 0.5495, "step": 774 }, { "epoch": 1.3620386643233744, "grad_norm": 1.2203769654236059, "learning_rate": 7.298357211140102e-05, "loss": 0.561, "step": 775 }, { "epoch": 1.3637961335676625, "grad_norm": 0.7174643503067877, "learning_rate": 7.295577698671671e-05, "loss": 0.558, "step": 776 }, { "epoch": 1.3655536028119508, "grad_norm": 0.4094779879899827, "learning_rate": 7.292793223122784e-05, "loss": 0.5573, "step": 777 }, { "epoch": 1.3673110720562391, "grad_norm": 0.9018168864255416, "learning_rate": 7.29000378868681e-05, "loss": 0.5606, "step": 778 }, { "epoch": 1.3690685413005272, "grad_norm": 1.196636827532728, "learning_rate": 7.28720939956459e-05, "loss": 0.5687, "step": 779 }, { "epoch": 1.3708260105448153, "grad_norm": 0.6473366321987943, "learning_rate": 7.28441005996442e-05, "loss": 0.5697, "step": 780 }, { "epoch": 1.3725834797891037, "grad_norm": 0.6606089309121528, "learning_rate": 7.281605774102054e-05, "loss": 0.5491, "step": 781 }, { "epoch": 1.374340949033392, "grad_norm": 0.4975677619420667, "learning_rate": 7.2787965462007e-05, "loss": 0.5395, "step": 782 }, { "epoch": 1.37609841827768, "grad_norm": 0.571880222973183, "learning_rate": 7.275982380491002e-05, "loss": 0.5663, "step": 783 }, { "epoch": 1.3778558875219684, "grad_norm": 0.696833553778467, "learning_rate": 7.273163281211043e-05, "loss": 0.5538, "step": 784 }, { "epoch": 1.3796133567662565, "grad_norm": 0.6107521055044687, "learning_rate": 7.270339252606335e-05, "loss": 0.5536, "step": 785 }, { "epoch": 1.3813708260105448, "grad_norm": 0.7146842840780441, "learning_rate": 7.267510298929815e-05, "loss": 0.5565, "step": 786 }, { "epoch": 1.3831282952548332, "grad_norm": 0.7109555127329377, "learning_rate": 7.264676424441836e-05, "loss": 0.5563, "step": 787 }, { "epoch": 1.3848857644991213, "grad_norm": 0.49912171922984855, "learning_rate": 7.26183763341016e-05, "loss": 0.5564, "step": 788 }, { "epoch": 1.3866432337434094, "grad_norm": 0.49061019231666086, "learning_rate": 7.258993930109958e-05, "loss": 0.5553, "step": 789 }, { "epoch": 1.3884007029876977, "grad_norm": 0.4714063961442492, "learning_rate": 7.256145318823795e-05, "loss": 0.5583, "step": 790 }, { "epoch": 1.390158172231986, "grad_norm": 0.5854772006795643, "learning_rate": 7.253291803841624e-05, "loss": 0.5537, "step": 791 }, { "epoch": 1.3919156414762741, "grad_norm": 0.6079648054102967, "learning_rate": 7.250433389460794e-05, "loss": 0.5564, "step": 792 }, { "epoch": 1.3936731107205624, "grad_norm": 0.5651938557451706, "learning_rate": 7.24757007998602e-05, "loss": 0.5691, "step": 793 }, { "epoch": 1.3954305799648505, "grad_norm": 0.6248536321517544, "learning_rate": 7.244701879729395e-05, "loss": 0.5678, "step": 794 }, { "epoch": 1.3971880492091389, "grad_norm": 0.6755764842700273, "learning_rate": 7.24182879301038e-05, "loss": 0.5462, "step": 795 }, { "epoch": 1.3989455184534272, "grad_norm": 0.5352332386189719, "learning_rate": 7.238950824155789e-05, "loss": 0.5445, "step": 796 }, { "epoch": 1.4007029876977153, "grad_norm": 0.4297986026892133, "learning_rate": 7.236067977499791e-05, "loss": 0.5539, "step": 797 }, { "epoch": 1.4024604569420034, "grad_norm": 0.44171487883428046, "learning_rate": 7.233180257383901e-05, "loss": 0.5552, "step": 798 }, { "epoch": 1.4042179261862917, "grad_norm": 0.42837089374478216, "learning_rate": 7.230287668156975e-05, "loss": 0.5424, "step": 799 }, { "epoch": 1.40597539543058, "grad_norm": 0.48597161239876896, "learning_rate": 7.227390214175199e-05, "loss": 0.5536, "step": 800 }, { "epoch": 1.4077328646748681, "grad_norm": 0.3755239336239794, "learning_rate": 7.224487899802084e-05, "loss": 0.5578, "step": 801 }, { "epoch": 1.4094903339191565, "grad_norm": 0.41328689561517234, "learning_rate": 7.221580729408468e-05, "loss": 0.5484, "step": 802 }, { "epoch": 1.4112478031634446, "grad_norm": 0.4693608613967653, "learning_rate": 7.218668707372493e-05, "loss": 0.5495, "step": 803 }, { "epoch": 1.4130052724077329, "grad_norm": 0.48523103846672755, "learning_rate": 7.215751838079613e-05, "loss": 0.5539, "step": 804 }, { "epoch": 1.4147627416520212, "grad_norm": 0.5718453978397501, "learning_rate": 7.21283012592258e-05, "loss": 0.5561, "step": 805 }, { "epoch": 1.4165202108963093, "grad_norm": 0.6393967216056681, "learning_rate": 7.209903575301442e-05, "loss": 0.5418, "step": 806 }, { "epoch": 1.4182776801405974, "grad_norm": 0.7532960168608119, "learning_rate": 7.206972190623527e-05, "loss": 0.5613, "step": 807 }, { "epoch": 1.4200351493848857, "grad_norm": 0.9732692136437875, "learning_rate": 7.20403597630345e-05, "loss": 0.5501, "step": 808 }, { "epoch": 1.421792618629174, "grad_norm": 1.1267126475479776, "learning_rate": 7.201094936763097e-05, "loss": 0.5445, "step": 809 }, { "epoch": 1.4235500878734622, "grad_norm": 0.6850379109063256, "learning_rate": 7.19814907643162e-05, "loss": 0.5608, "step": 810 }, { "epoch": 1.4253075571177505, "grad_norm": 0.3935570185092423, "learning_rate": 7.195198399745432e-05, "loss": 0.5397, "step": 811 }, { "epoch": 1.4270650263620386, "grad_norm": 0.568195867869367, "learning_rate": 7.192242911148198e-05, "loss": 0.5549, "step": 812 }, { "epoch": 1.428822495606327, "grad_norm": 0.6607315722196295, "learning_rate": 7.189282615090829e-05, "loss": 0.5629, "step": 813 }, { "epoch": 1.4305799648506152, "grad_norm": 0.7130377054303028, "learning_rate": 7.18631751603148e-05, "loss": 0.5643, "step": 814 }, { "epoch": 1.4323374340949033, "grad_norm": 0.8687522994158337, "learning_rate": 7.183347618435535e-05, "loss": 0.5442, "step": 815 }, { "epoch": 1.4340949033391914, "grad_norm": 1.0486220010783678, "learning_rate": 7.180372926775606e-05, "loss": 0.5571, "step": 816 }, { "epoch": 1.4358523725834798, "grad_norm": 0.9133346572544545, "learning_rate": 7.177393445531527e-05, "loss": 0.5538, "step": 817 }, { "epoch": 1.437609841827768, "grad_norm": 0.659260503011505, "learning_rate": 7.174409179190339e-05, "loss": 0.5484, "step": 818 }, { "epoch": 1.4393673110720562, "grad_norm": 0.5964480566089356, "learning_rate": 7.171420132246297e-05, "loss": 0.5584, "step": 819 }, { "epoch": 1.4411247803163445, "grad_norm": 0.6565613406925231, "learning_rate": 7.16842630920085e-05, "loss": 0.5492, "step": 820 }, { "epoch": 1.4428822495606326, "grad_norm": 0.6477756274931831, "learning_rate": 7.165427714562642e-05, "loss": 0.5531, "step": 821 }, { "epoch": 1.444639718804921, "grad_norm": 0.7072720118532487, "learning_rate": 7.1624243528475e-05, "loss": 0.5695, "step": 822 }, { "epoch": 1.4463971880492092, "grad_norm": 0.6230513372639942, "learning_rate": 7.159416228578437e-05, "loss": 0.548, "step": 823 }, { "epoch": 1.4481546572934973, "grad_norm": 0.6407309804003243, "learning_rate": 7.156403346285629e-05, "loss": 0.5683, "step": 824 }, { "epoch": 1.4499121265377855, "grad_norm": 0.5808825854231962, "learning_rate": 7.153385710506426e-05, "loss": 0.549, "step": 825 }, { "epoch": 1.4516695957820738, "grad_norm": 0.7017482563849676, "learning_rate": 7.150363325785331e-05, "loss": 0.5572, "step": 826 }, { "epoch": 1.453427065026362, "grad_norm": 0.6397423539824456, "learning_rate": 7.147336196674e-05, "loss": 0.5483, "step": 827 }, { "epoch": 1.4551845342706502, "grad_norm": 0.6998339179142865, "learning_rate": 7.144304327731237e-05, "loss": 0.5485, "step": 828 }, { "epoch": 1.4569420035149385, "grad_norm": 0.6799587459256526, "learning_rate": 7.14126772352298e-05, "loss": 0.5452, "step": 829 }, { "epoch": 1.4586994727592266, "grad_norm": 0.7363385961655733, "learning_rate": 7.138226388622302e-05, "loss": 0.5677, "step": 830 }, { "epoch": 1.460456942003515, "grad_norm": 0.4950628227047943, "learning_rate": 7.135180327609396e-05, "loss": 0.5411, "step": 831 }, { "epoch": 1.4622144112478033, "grad_norm": 0.3563462697661431, "learning_rate": 7.132129545071576e-05, "loss": 0.5545, "step": 832 }, { "epoch": 1.4639718804920914, "grad_norm": 0.5225915415978015, "learning_rate": 7.129074045603267e-05, "loss": 0.5441, "step": 833 }, { "epoch": 1.4657293497363797, "grad_norm": 0.487626393409654, "learning_rate": 7.126013833805993e-05, "loss": 0.5589, "step": 834 }, { "epoch": 1.4674868189806678, "grad_norm": 0.4137934702218913, "learning_rate": 7.122948914288378e-05, "loss": 0.5495, "step": 835 }, { "epoch": 1.4692442882249561, "grad_norm": 0.39980967772231896, "learning_rate": 7.119879291666138e-05, "loss": 0.542, "step": 836 }, { "epoch": 1.4710017574692442, "grad_norm": 0.3733319598263631, "learning_rate": 7.116804970562068e-05, "loss": 0.5555, "step": 837 }, { "epoch": 1.4727592267135325, "grad_norm": 0.3672915546215812, "learning_rate": 7.113725955606038e-05, "loss": 0.5538, "step": 838 }, { "epoch": 1.4745166959578206, "grad_norm": 0.4761548596505707, "learning_rate": 7.110642251434992e-05, "loss": 0.5522, "step": 839 }, { "epoch": 1.476274165202109, "grad_norm": 0.46195347213198434, "learning_rate": 7.10755386269293e-05, "loss": 0.5378, "step": 840 }, { "epoch": 1.4780316344463973, "grad_norm": 0.3635079953887453, "learning_rate": 7.104460794030912e-05, "loss": 0.5597, "step": 841 }, { "epoch": 1.4797891036906854, "grad_norm": 0.39102954909620596, "learning_rate": 7.101363050107041e-05, "loss": 0.5416, "step": 842 }, { "epoch": 1.4815465729349737, "grad_norm": 0.5216946003307621, "learning_rate": 7.098260635586467e-05, "loss": 0.5577, "step": 843 }, { "epoch": 1.4833040421792618, "grad_norm": 0.5849192388783214, "learning_rate": 7.095153555141367e-05, "loss": 0.5604, "step": 844 }, { "epoch": 1.4850615114235501, "grad_norm": 0.6043951089393184, "learning_rate": 7.092041813450948e-05, "loss": 0.546, "step": 845 }, { "epoch": 1.4868189806678382, "grad_norm": 0.7889824803288501, "learning_rate": 7.08892541520144e-05, "loss": 0.5467, "step": 846 }, { "epoch": 1.4885764499121266, "grad_norm": 1.0004422024199344, "learning_rate": 7.085804365086078e-05, "loss": 0.5541, "step": 847 }, { "epoch": 1.4903339191564147, "grad_norm": 1.2354745857028575, "learning_rate": 7.082678667805109e-05, "loss": 0.5623, "step": 848 }, { "epoch": 1.492091388400703, "grad_norm": 0.5638672397219355, "learning_rate": 7.079548328065779e-05, "loss": 0.5546, "step": 849 }, { "epoch": 1.4938488576449913, "grad_norm": 0.4795674968124183, "learning_rate": 7.076413350582319e-05, "loss": 0.5602, "step": 850 }, { "epoch": 1.4956063268892794, "grad_norm": 0.8481984988547863, "learning_rate": 7.073273740075951e-05, "loss": 0.5524, "step": 851 }, { "epoch": 1.4973637961335677, "grad_norm": 1.1004253732178093, "learning_rate": 7.070129501274871e-05, "loss": 0.5447, "step": 852 }, { "epoch": 1.4991212653778558, "grad_norm": 0.824005544401793, "learning_rate": 7.066980638914247e-05, "loss": 0.5612, "step": 853 }, { "epoch": 1.5008787346221442, "grad_norm": 0.5194329090562714, "learning_rate": 7.063827157736206e-05, "loss": 0.557, "step": 854 }, { "epoch": 1.5026362038664325, "grad_norm": 0.4228352028280271, "learning_rate": 7.060669062489837e-05, "loss": 0.5488, "step": 855 }, { "epoch": 1.5043936731107206, "grad_norm": 0.4567150420681907, "learning_rate": 7.057506357931172e-05, "loss": 0.5424, "step": 856 }, { "epoch": 1.5061511423550087, "grad_norm": 0.4984903161520427, "learning_rate": 7.054339048823187e-05, "loss": 0.5566, "step": 857 }, { "epoch": 1.507908611599297, "grad_norm": 0.49190372382563574, "learning_rate": 7.051167139935793e-05, "loss": 0.5526, "step": 858 }, { "epoch": 1.5096660808435853, "grad_norm": 0.43870177929713805, "learning_rate": 7.047990636045827e-05, "loss": 0.5595, "step": 859 }, { "epoch": 1.5114235500878734, "grad_norm": 0.38634276706406123, "learning_rate": 7.044809541937047e-05, "loss": 0.5482, "step": 860 }, { "epoch": 1.5131810193321615, "grad_norm": 0.4484119535541328, "learning_rate": 7.041623862400125e-05, "loss": 0.5489, "step": 861 }, { "epoch": 1.5149384885764499, "grad_norm": 0.44951890273338524, "learning_rate": 7.038433602232633e-05, "loss": 0.5451, "step": 862 }, { "epoch": 1.5166959578207382, "grad_norm": 0.4203135354487864, "learning_rate": 7.035238766239048e-05, "loss": 0.5533, "step": 863 }, { "epoch": 1.5184534270650265, "grad_norm": 0.453244444556481, "learning_rate": 7.032039359230732e-05, "loss": 0.5497, "step": 864 }, { "epoch": 1.5202108963093146, "grad_norm": 0.4190222514263043, "learning_rate": 7.028835386025939e-05, "loss": 0.5492, "step": 865 }, { "epoch": 1.5219683655536027, "grad_norm": 0.3834212968446958, "learning_rate": 7.025626851449791e-05, "loss": 0.5483, "step": 866 }, { "epoch": 1.523725834797891, "grad_norm": 0.4788117316024204, "learning_rate": 7.022413760334285e-05, "loss": 0.5562, "step": 867 }, { "epoch": 1.5254833040421794, "grad_norm": 0.3464280348899741, "learning_rate": 7.019196117518276e-05, "loss": 0.5677, "step": 868 }, { "epoch": 1.5272407732864675, "grad_norm": 0.3918765585777357, "learning_rate": 7.015973927847479e-05, "loss": 0.5493, "step": 869 }, { "epoch": 1.5289982425307556, "grad_norm": 0.5073579793754366, "learning_rate": 7.012747196174451e-05, "loss": 0.5525, "step": 870 }, { "epoch": 1.5307557117750439, "grad_norm": 0.5400837964397749, "learning_rate": 7.009515927358592e-05, "loss": 0.5451, "step": 871 }, { "epoch": 1.5325131810193322, "grad_norm": 0.6971662783713828, "learning_rate": 7.006280126266134e-05, "loss": 0.5391, "step": 872 }, { "epoch": 1.5342706502636205, "grad_norm": 0.8429917461228013, "learning_rate": 7.003039797770138e-05, "loss": 0.5637, "step": 873 }, { "epoch": 1.5360281195079086, "grad_norm": 0.959460994758824, "learning_rate": 6.999794946750477e-05, "loss": 0.5461, "step": 874 }, { "epoch": 1.5377855887521967, "grad_norm": 1.1062670517477464, "learning_rate": 6.996545578093838e-05, "loss": 0.547, "step": 875 }, { "epoch": 1.539543057996485, "grad_norm": 0.7910532284392084, "learning_rate": 6.993291696693712e-05, "loss": 0.5366, "step": 876 }, { "epoch": 1.5413005272407734, "grad_norm": 0.5146442962288506, "learning_rate": 6.990033307450388e-05, "loss": 0.5456, "step": 877 }, { "epoch": 1.5430579964850615, "grad_norm": 0.4057363691712105, "learning_rate": 6.986770415270938e-05, "loss": 0.5504, "step": 878 }, { "epoch": 1.5448154657293496, "grad_norm": 0.5549114247615515, "learning_rate": 6.98350302506922e-05, "loss": 0.555, "step": 879 }, { "epoch": 1.546572934973638, "grad_norm": 0.7324815708057197, "learning_rate": 6.980231141765865e-05, "loss": 0.554, "step": 880 }, { "epoch": 1.5483304042179262, "grad_norm": 0.8387306859683366, "learning_rate": 6.976954770288268e-05, "loss": 0.5699, "step": 881 }, { "epoch": 1.5500878734622145, "grad_norm": 0.8585763291880227, "learning_rate": 6.973673915570589e-05, "loss": 0.5537, "step": 882 }, { "epoch": 1.5518453427065027, "grad_norm": 0.7358713373724051, "learning_rate": 6.970388582553733e-05, "loss": 0.5472, "step": 883 }, { "epoch": 1.5536028119507908, "grad_norm": 0.5413528616877977, "learning_rate": 6.967098776185353e-05, "loss": 0.5386, "step": 884 }, { "epoch": 1.555360281195079, "grad_norm": 0.46550007235321395, "learning_rate": 6.963804501419837e-05, "loss": 0.5503, "step": 885 }, { "epoch": 1.5571177504393674, "grad_norm": 0.5419395222331074, "learning_rate": 6.960505763218305e-05, "loss": 0.5403, "step": 886 }, { "epoch": 1.5588752196836555, "grad_norm": 0.5942936502346011, "learning_rate": 6.957202566548596e-05, "loss": 0.5488, "step": 887 }, { "epoch": 1.5606326889279436, "grad_norm": 0.6170362712984698, "learning_rate": 6.953894916385266e-05, "loss": 0.5485, "step": 888 }, { "epoch": 1.562390158172232, "grad_norm": 0.5694059337877693, "learning_rate": 6.950582817709573e-05, "loss": 0.5555, "step": 889 }, { "epoch": 1.5641476274165202, "grad_norm": 0.4986078590522339, "learning_rate": 6.94726627550948e-05, "loss": 0.5607, "step": 890 }, { "epoch": 1.5659050966608086, "grad_norm": 0.5362063021828795, "learning_rate": 6.94394529477964e-05, "loss": 0.5439, "step": 891 }, { "epoch": 1.5676625659050967, "grad_norm": 0.6123630728578692, "learning_rate": 6.94061988052139e-05, "loss": 0.5525, "step": 892 }, { "epoch": 1.5694200351493848, "grad_norm": 0.6320763158312059, "learning_rate": 6.937290037742743e-05, "loss": 0.5561, "step": 893 }, { "epoch": 1.571177504393673, "grad_norm": 0.670327990567313, "learning_rate": 6.933955771458382e-05, "loss": 0.5504, "step": 894 }, { "epoch": 1.5729349736379614, "grad_norm": 0.7204055691272119, "learning_rate": 6.930617086689651e-05, "loss": 0.5486, "step": 895 }, { "epoch": 1.5746924428822495, "grad_norm": 0.6852718711754624, "learning_rate": 6.927273988464552e-05, "loss": 0.5566, "step": 896 }, { "epoch": 1.5764499121265376, "grad_norm": 0.6542148093490074, "learning_rate": 6.923926481817728e-05, "loss": 0.5448, "step": 897 }, { "epoch": 1.578207381370826, "grad_norm": 0.6067364487525669, "learning_rate": 6.920574571790463e-05, "loss": 0.5445, "step": 898 }, { "epoch": 1.5799648506151143, "grad_norm": 0.5036000105229465, "learning_rate": 6.917218263430675e-05, "loss": 0.547, "step": 899 }, { "epoch": 1.5817223198594026, "grad_norm": 0.5839110872079004, "learning_rate": 6.913857561792901e-05, "loss": 0.5453, "step": 900 }, { "epoch": 1.5834797891036907, "grad_norm": 0.5685868246085904, "learning_rate": 6.9104924719383e-05, "loss": 0.537, "step": 901 }, { "epoch": 1.5852372583479788, "grad_norm": 0.44929454102344507, "learning_rate": 6.907122998934635e-05, "loss": 0.5326, "step": 902 }, { "epoch": 1.5869947275922671, "grad_norm": 0.39252781885942956, "learning_rate": 6.903749147856272e-05, "loss": 0.5532, "step": 903 }, { "epoch": 1.5887521968365554, "grad_norm": 0.4662675134775701, "learning_rate": 6.900370923784166e-05, "loss": 0.5352, "step": 904 }, { "epoch": 1.5905096660808435, "grad_norm": 0.5662582964947827, "learning_rate": 6.89698833180587e-05, "loss": 0.5478, "step": 905 }, { "epoch": 1.5922671353251316, "grad_norm": 0.5975242981851273, "learning_rate": 6.893601377015497e-05, "loss": 0.5475, "step": 906 }, { "epoch": 1.59402460456942, "grad_norm": 0.6702799015012032, "learning_rate": 6.890210064513745e-05, "loss": 0.5391, "step": 907 }, { "epoch": 1.5957820738137083, "grad_norm": 0.6980489306240637, "learning_rate": 6.886814399407867e-05, "loss": 0.537, "step": 908 }, { "epoch": 1.5975395430579966, "grad_norm": 0.6383496644226038, "learning_rate": 6.883414386811676e-05, "loss": 0.553, "step": 909 }, { "epoch": 1.5992970123022847, "grad_norm": 0.5547250231543184, "learning_rate": 6.880010031845528e-05, "loss": 0.5358, "step": 910 }, { "epoch": 1.6010544815465728, "grad_norm": 0.3788603393092982, "learning_rate": 6.876601339636319e-05, "loss": 0.5406, "step": 911 }, { "epoch": 1.6028119507908611, "grad_norm": 0.3377205172277864, "learning_rate": 6.873188315317478e-05, "loss": 0.547, "step": 912 }, { "epoch": 1.6045694200351495, "grad_norm": 0.505323119191615, "learning_rate": 6.869770964028962e-05, "loss": 0.5387, "step": 913 }, { "epoch": 1.6063268892794376, "grad_norm": 0.698879600123432, "learning_rate": 6.866349290917233e-05, "loss": 0.539, "step": 914 }, { "epoch": 1.6080843585237259, "grad_norm": 0.9286688185121885, "learning_rate": 6.862923301135275e-05, "loss": 0.5433, "step": 915 }, { "epoch": 1.609841827768014, "grad_norm": 1.0835141579315193, "learning_rate": 6.859492999842564e-05, "loss": 0.5453, "step": 916 }, { "epoch": 1.6115992970123023, "grad_norm": 0.8468982936437256, "learning_rate": 6.856058392205073e-05, "loss": 0.542, "step": 917 }, { "epoch": 1.6133567662565906, "grad_norm": 0.6009257042587948, "learning_rate": 6.852619483395259e-05, "loss": 0.5556, "step": 918 }, { "epoch": 1.6151142355008787, "grad_norm": 0.4359816883321716, "learning_rate": 6.849176278592055e-05, "loss": 0.5223, "step": 919 }, { "epoch": 1.6168717047451668, "grad_norm": 0.6277432799952835, "learning_rate": 6.845728782980866e-05, "loss": 0.553, "step": 920 }, { "epoch": 1.6186291739894552, "grad_norm": 0.830270116639101, "learning_rate": 6.842277001753559e-05, "loss": 0.5564, "step": 921 }, { "epoch": 1.6203866432337435, "grad_norm": 0.8420249429363015, "learning_rate": 6.838820940108452e-05, "loss": 0.5441, "step": 922 }, { "epoch": 1.6221441124780316, "grad_norm": 0.6469592145433414, "learning_rate": 6.835360603250314e-05, "loss": 0.5517, "step": 923 }, { "epoch": 1.62390158172232, "grad_norm": 0.499839345422203, "learning_rate": 6.831895996390349e-05, "loss": 0.5648, "step": 924 }, { "epoch": 1.625659050966608, "grad_norm": 0.4324939387413902, "learning_rate": 6.828427124746191e-05, "loss": 0.5393, "step": 925 }, { "epoch": 1.6274165202108963, "grad_norm": 0.47983327282435007, "learning_rate": 6.824953993541898e-05, "loss": 0.5444, "step": 926 }, { "epoch": 1.6291739894551847, "grad_norm": 0.47249894646244706, "learning_rate": 6.821476608007945e-05, "loss": 0.533, "step": 927 }, { "epoch": 1.6309314586994728, "grad_norm": 0.5238839113810849, "learning_rate": 6.81799497338121e-05, "loss": 0.538, "step": 928 }, { "epoch": 1.6326889279437609, "grad_norm": 0.5512731116491615, "learning_rate": 6.814509094904973e-05, "loss": 0.5473, "step": 929 }, { "epoch": 1.6344463971880492, "grad_norm": 0.5380460182867385, "learning_rate": 6.811018977828901e-05, "loss": 0.5716, "step": 930 }, { "epoch": 1.6362038664323375, "grad_norm": 0.5076173074809301, "learning_rate": 6.807524627409052e-05, "loss": 0.5584, "step": 931 }, { "epoch": 1.6379613356766256, "grad_norm": 0.5114230826927655, "learning_rate": 6.804026048907851e-05, "loss": 0.5423, "step": 932 }, { "epoch": 1.639718804920914, "grad_norm": 0.5364047542551846, "learning_rate": 6.800523247594095e-05, "loss": 0.5464, "step": 933 }, { "epoch": 1.641476274165202, "grad_norm": 0.5039624235115675, "learning_rate": 6.797016228742939e-05, "loss": 0.5405, "step": 934 }, { "epoch": 1.6432337434094904, "grad_norm": 0.5417546207091917, "learning_rate": 6.793504997635893e-05, "loss": 0.5457, "step": 935 }, { "epoch": 1.6449912126537787, "grad_norm": 0.5395366802921007, "learning_rate": 6.789989559560802e-05, "loss": 0.5429, "step": 936 }, { "epoch": 1.6467486818980668, "grad_norm": 0.5631756356623545, "learning_rate": 6.786469919811857e-05, "loss": 0.5338, "step": 937 }, { "epoch": 1.6485061511423549, "grad_norm": 0.5964114260240468, "learning_rate": 6.78294608368957e-05, "loss": 0.5547, "step": 938 }, { "epoch": 1.6502636203866432, "grad_norm": 0.561287127106773, "learning_rate": 6.779418056500774e-05, "loss": 0.5505, "step": 939 }, { "epoch": 1.6520210896309315, "grad_norm": 0.6646352425710532, "learning_rate": 6.775885843558616e-05, "loss": 0.5437, "step": 940 }, { "epoch": 1.6537785588752196, "grad_norm": 0.6816099409505159, "learning_rate": 6.772349450182545e-05, "loss": 0.5574, "step": 941 }, { "epoch": 1.655536028119508, "grad_norm": 0.6651018979160451, "learning_rate": 6.768808881698302e-05, "loss": 0.5562, "step": 942 }, { "epoch": 1.657293497363796, "grad_norm": 0.6665837595452804, "learning_rate": 6.765264143437921e-05, "loss": 0.5419, "step": 943 }, { "epoch": 1.6590509666080844, "grad_norm": 0.5953485309939044, "learning_rate": 6.761715240739717e-05, "loss": 0.5422, "step": 944 }, { "epoch": 1.6608084358523727, "grad_norm": 0.5567474243617794, "learning_rate": 6.758162178948268e-05, "loss": 0.5448, "step": 945 }, { "epoch": 1.6625659050966608, "grad_norm": 0.6932832445092652, "learning_rate": 6.754604963414425e-05, "loss": 0.569, "step": 946 }, { "epoch": 1.664323374340949, "grad_norm": 0.5556616400170085, "learning_rate": 6.751043599495286e-05, "loss": 0.5523, "step": 947 }, { "epoch": 1.6660808435852372, "grad_norm": 0.46863398757624386, "learning_rate": 6.747478092554207e-05, "loss": 0.5415, "step": 948 }, { "epoch": 1.6678383128295255, "grad_norm": 0.35851513875901314, "learning_rate": 6.743908447960772e-05, "loss": 0.5523, "step": 949 }, { "epoch": 1.6695957820738139, "grad_norm": 0.3458096629260963, "learning_rate": 6.740334671090802e-05, "loss": 0.5524, "step": 950 }, { "epoch": 1.671353251318102, "grad_norm": 0.3725131840249367, "learning_rate": 6.736756767326341e-05, "loss": 0.5372, "step": 951 }, { "epoch": 1.67311072056239, "grad_norm": 0.30684989091705006, "learning_rate": 6.733174742055649e-05, "loss": 0.547, "step": 952 }, { "epoch": 1.6748681898066784, "grad_norm": 0.30893990541984506, "learning_rate": 6.729588600673187e-05, "loss": 0.5631, "step": 953 }, { "epoch": 1.6766256590509667, "grad_norm": 0.47730413996074295, "learning_rate": 6.725998348579625e-05, "loss": 0.5466, "step": 954 }, { "epoch": 1.6783831282952548, "grad_norm": 0.5226082207607079, "learning_rate": 6.722403991181813e-05, "loss": 0.5336, "step": 955 }, { "epoch": 1.680140597539543, "grad_norm": 0.5507900199105874, "learning_rate": 6.718805533892789e-05, "loss": 0.5408, "step": 956 }, { "epoch": 1.6818980667838312, "grad_norm": 0.5171793113248411, "learning_rate": 6.715202982131768e-05, "loss": 0.5329, "step": 957 }, { "epoch": 1.6836555360281196, "grad_norm": 0.5062731212886052, "learning_rate": 6.711596341324123e-05, "loss": 0.5473, "step": 958 }, { "epoch": 1.685413005272408, "grad_norm": 0.5920879617275819, "learning_rate": 6.707985616901394e-05, "loss": 0.5581, "step": 959 }, { "epoch": 1.687170474516696, "grad_norm": 0.6552480998445063, "learning_rate": 6.704370814301264e-05, "loss": 0.5382, "step": 960 }, { "epoch": 1.688927943760984, "grad_norm": 0.7704789540224866, "learning_rate": 6.700751938967563e-05, "loss": 0.5512, "step": 961 }, { "epoch": 1.6906854130052724, "grad_norm": 0.8367591208015782, "learning_rate": 6.697128996350249e-05, "loss": 0.5541, "step": 962 }, { "epoch": 1.6924428822495607, "grad_norm": 0.8203787967719358, "learning_rate": 6.69350199190541e-05, "loss": 0.535, "step": 963 }, { "epoch": 1.6942003514938488, "grad_norm": 0.8762318550716108, "learning_rate": 6.689870931095247e-05, "loss": 0.5358, "step": 964 }, { "epoch": 1.695957820738137, "grad_norm": 0.9427927295035419, "learning_rate": 6.686235819388075e-05, "loss": 0.5538, "step": 965 }, { "epoch": 1.6977152899824253, "grad_norm": 1.1012397704193815, "learning_rate": 6.682596662258304e-05, "loss": 0.5535, "step": 966 }, { "epoch": 1.6994727592267136, "grad_norm": 0.943029054290013, "learning_rate": 6.67895346518644e-05, "loss": 0.5493, "step": 967 }, { "epoch": 1.701230228471002, "grad_norm": 0.8108356234666215, "learning_rate": 6.675306233659073e-05, "loss": 0.558, "step": 968 }, { "epoch": 1.70298769771529, "grad_norm": 0.7087643067398374, "learning_rate": 6.671654973168865e-05, "loss": 0.5384, "step": 969 }, { "epoch": 1.7047451669595781, "grad_norm": 0.5917586567647963, "learning_rate": 6.667999689214551e-05, "loss": 0.5409, "step": 970 }, { "epoch": 1.7065026362038664, "grad_norm": 0.5528703524765244, "learning_rate": 6.664340387300922e-05, "loss": 0.5481, "step": 971 }, { "epoch": 1.7082601054481548, "grad_norm": 0.5152612241756322, "learning_rate": 6.660677072938818e-05, "loss": 0.5411, "step": 972 }, { "epoch": 1.7100175746924429, "grad_norm": 0.5564002888112733, "learning_rate": 6.657009751645128e-05, "loss": 0.5545, "step": 973 }, { "epoch": 1.711775043936731, "grad_norm": 0.6913012468333406, "learning_rate": 6.653338428942768e-05, "loss": 0.5557, "step": 974 }, { "epoch": 1.7135325131810193, "grad_norm": 0.7426250265259617, "learning_rate": 6.649663110360688e-05, "loss": 0.5538, "step": 975 }, { "epoch": 1.7152899824253076, "grad_norm": 0.706944138469579, "learning_rate": 6.645983801433847e-05, "loss": 0.5437, "step": 976 }, { "epoch": 1.717047451669596, "grad_norm": 0.7038634165787659, "learning_rate": 6.642300507703222e-05, "loss": 0.5485, "step": 977 }, { "epoch": 1.718804920913884, "grad_norm": 0.6603058042978189, "learning_rate": 6.638613234715781e-05, "loss": 0.5546, "step": 978 }, { "epoch": 1.7205623901581721, "grad_norm": 0.4996690950658728, "learning_rate": 6.634921988024496e-05, "loss": 0.5508, "step": 979 }, { "epoch": 1.7223198594024605, "grad_norm": 0.369867117922825, "learning_rate": 6.631226773188316e-05, "loss": 0.5473, "step": 980 }, { "epoch": 1.7240773286467488, "grad_norm": 0.9396974524658214, "learning_rate": 6.627527595772166e-05, "loss": 0.5579, "step": 981 }, { "epoch": 1.7258347978910369, "grad_norm": 6.76720536530387, "learning_rate": 6.623824461346943e-05, "loss": 0.5513, "step": 982 }, { "epoch": 1.727592267135325, "grad_norm": 4.331010235965194, "learning_rate": 6.620117375489498e-05, "loss": 0.6128, "step": 983 }, { "epoch": 1.7293497363796133, "grad_norm": 0.7702930133115985, "learning_rate": 6.616406343782637e-05, "loss": 0.5666, "step": 984 }, { "epoch": 1.7311072056239016, "grad_norm": 1.6229473042638018, "learning_rate": 6.612691371815106e-05, "loss": 0.5676, "step": 985 }, { "epoch": 1.73286467486819, "grad_norm": 0.666147894200385, "learning_rate": 6.608972465181589e-05, "loss": 0.5464, "step": 986 }, { "epoch": 1.734622144112478, "grad_norm": 1.2361896400346648, "learning_rate": 6.605249629482686e-05, "loss": 0.5516, "step": 987 }, { "epoch": 1.7363796133567662, "grad_norm": 0.659149231512002, "learning_rate": 6.601522870324925e-05, "loss": 0.5565, "step": 988 }, { "epoch": 1.7381370826010545, "grad_norm": 1.0072492652944145, "learning_rate": 6.597792193320734e-05, "loss": 0.5432, "step": 989 }, { "epoch": 1.7398945518453428, "grad_norm": 0.9279257388298712, "learning_rate": 6.59405760408845e-05, "loss": 0.5545, "step": 990 }, { "epoch": 1.741652021089631, "grad_norm": 0.7563469814342753, "learning_rate": 6.590319108252294e-05, "loss": 0.554, "step": 991 }, { "epoch": 1.743409490333919, "grad_norm": 0.8395144246131742, "learning_rate": 6.586576711442373e-05, "loss": 0.5486, "step": 992 }, { "epoch": 1.7451669595782073, "grad_norm": 0.7030091308694886, "learning_rate": 6.58283041929467e-05, "loss": 0.5441, "step": 993 }, { "epoch": 1.7469244288224957, "grad_norm": 0.7528371762948625, "learning_rate": 6.579080237451032e-05, "loss": 0.5666, "step": 994 }, { "epoch": 1.748681898066784, "grad_norm": 0.7512415258982409, "learning_rate": 6.575326171559165e-05, "loss": 0.5407, "step": 995 }, { "epoch": 1.750439367311072, "grad_norm": 0.6780058931865728, "learning_rate": 6.571568227272629e-05, "loss": 0.5467, "step": 996 }, { "epoch": 1.7521968365553602, "grad_norm": 0.572187171099151, "learning_rate": 6.567806410250812e-05, "loss": 0.546, "step": 997 }, { "epoch": 1.7539543057996485, "grad_norm": 0.5190295328948532, "learning_rate": 6.564040726158951e-05, "loss": 0.5345, "step": 998 }, { "epoch": 1.7557117750439368, "grad_norm": 0.5457097852249042, "learning_rate": 6.560271180668091e-05, "loss": 0.5596, "step": 999 }, { "epoch": 1.757469244288225, "grad_norm": 0.7197384475003286, "learning_rate": 6.556497779455104e-05, "loss": 0.5578, "step": 1000 }, { "epoch": 1.759226713532513, "grad_norm": 0.4342676668442805, "learning_rate": 6.552720528202662e-05, "loss": 0.5594, "step": 1001 }, { "epoch": 1.7609841827768014, "grad_norm": 0.5472306061576313, "learning_rate": 6.548939432599237e-05, "loss": 0.5381, "step": 1002 }, { "epoch": 1.7627416520210897, "grad_norm": 0.558366659264968, "learning_rate": 6.545154498339093e-05, "loss": 0.5534, "step": 1003 }, { "epoch": 1.764499121265378, "grad_norm": 0.4720298005041288, "learning_rate": 6.541365731122268e-05, "loss": 0.5516, "step": 1004 }, { "epoch": 1.766256590509666, "grad_norm": 0.590819988181891, "learning_rate": 6.537573136654582e-05, "loss": 0.5513, "step": 1005 }, { "epoch": 1.7680140597539542, "grad_norm": 0.5189897176356834, "learning_rate": 6.533776720647613e-05, "loss": 0.5371, "step": 1006 }, { "epoch": 1.7697715289982425, "grad_norm": 0.5276281675793548, "learning_rate": 6.52997648881869e-05, "loss": 0.5347, "step": 1007 }, { "epoch": 1.7715289982425309, "grad_norm": 0.4695917524937606, "learning_rate": 6.526172446890899e-05, "loss": 0.5315, "step": 1008 }, { "epoch": 1.773286467486819, "grad_norm": 0.49042981023280513, "learning_rate": 6.522364600593056e-05, "loss": 0.5484, "step": 1009 }, { "epoch": 1.775043936731107, "grad_norm": 0.5038456541790495, "learning_rate": 6.51855295565971e-05, "loss": 0.5444, "step": 1010 }, { "epoch": 1.7768014059753954, "grad_norm": 0.4180887267528924, "learning_rate": 6.514737517831126e-05, "loss": 0.5418, "step": 1011 }, { "epoch": 1.7785588752196837, "grad_norm": 0.3810547979652265, "learning_rate": 6.510918292853288e-05, "loss": 0.5256, "step": 1012 }, { "epoch": 1.780316344463972, "grad_norm": 0.42438963305728794, "learning_rate": 6.507095286477879e-05, "loss": 0.5569, "step": 1013 }, { "epoch": 1.7820738137082601, "grad_norm": 0.3996573319174155, "learning_rate": 6.503268504462276e-05, "loss": 0.5641, "step": 1014 }, { "epoch": 1.7838312829525482, "grad_norm": 0.34776883234223666, "learning_rate": 6.499437952569547e-05, "loss": 0.5471, "step": 1015 }, { "epoch": 1.7855887521968365, "grad_norm": 0.44896489748482893, "learning_rate": 6.49560363656843e-05, "loss": 0.5516, "step": 1016 }, { "epoch": 1.7873462214411249, "grad_norm": 0.5355385326342952, "learning_rate": 6.491765562233336e-05, "loss": 0.5473, "step": 1017 }, { "epoch": 1.789103690685413, "grad_norm": 0.6049677458333957, "learning_rate": 6.487923735344339e-05, "loss": 0.541, "step": 1018 }, { "epoch": 1.790861159929701, "grad_norm": 0.6171703330045071, "learning_rate": 6.484078161687156e-05, "loss": 0.5264, "step": 1019 }, { "epoch": 1.7926186291739894, "grad_norm": 0.6251581682303455, "learning_rate": 6.480228847053157e-05, "loss": 0.5513, "step": 1020 }, { "epoch": 1.7943760984182777, "grad_norm": 0.8935818842443662, "learning_rate": 6.476375797239338e-05, "loss": 0.543, "step": 1021 }, { "epoch": 1.796133567662566, "grad_norm": 1.1690062089813291, "learning_rate": 6.472519018048318e-05, "loss": 0.5355, "step": 1022 }, { "epoch": 1.7978910369068541, "grad_norm": 0.7750744227861049, "learning_rate": 6.468658515288343e-05, "loss": 0.5327, "step": 1023 }, { "epoch": 1.7996485061511422, "grad_norm": 0.6395108316269777, "learning_rate": 6.464794294773257e-05, "loss": 0.544, "step": 1024 }, { "epoch": 1.8014059753954306, "grad_norm": 0.536753644575741, "learning_rate": 6.460926362322507e-05, "loss": 0.546, "step": 1025 }, { "epoch": 1.803163444639719, "grad_norm": 0.4110913092945629, "learning_rate": 6.45705472376113e-05, "loss": 0.557, "step": 1026 }, { "epoch": 1.804920913884007, "grad_norm": 0.7563766620814675, "learning_rate": 6.453179384919745e-05, "loss": 0.5417, "step": 1027 }, { "epoch": 1.806678383128295, "grad_norm": 1.0692078755671028, "learning_rate": 6.449300351634538e-05, "loss": 0.5552, "step": 1028 }, { "epoch": 1.8084358523725834, "grad_norm": 0.9340988023006458, "learning_rate": 6.445417629747266e-05, "loss": 0.5578, "step": 1029 }, { "epoch": 1.8101933216168717, "grad_norm": 0.7817431878745816, "learning_rate": 6.441531225105238e-05, "loss": 0.5415, "step": 1030 }, { "epoch": 1.81195079086116, "grad_norm": 0.5830944116618259, "learning_rate": 6.43764114356131e-05, "loss": 0.5463, "step": 1031 }, { "epoch": 1.8137082601054482, "grad_norm": 0.38609286791395236, "learning_rate": 6.433747390973871e-05, "loss": 0.5405, "step": 1032 }, { "epoch": 1.8154657293497363, "grad_norm": 0.3433594626220631, "learning_rate": 6.429849973206844e-05, "loss": 0.5328, "step": 1033 }, { "epoch": 1.8172231985940246, "grad_norm": 0.35045260636279674, "learning_rate": 6.425948896129671e-05, "loss": 0.5384, "step": 1034 }, { "epoch": 1.818980667838313, "grad_norm": 0.37590188098831456, "learning_rate": 6.422044165617304e-05, "loss": 0.5458, "step": 1035 }, { "epoch": 1.820738137082601, "grad_norm": 0.40781047173996077, "learning_rate": 6.418135787550193e-05, "loss": 0.5432, "step": 1036 }, { "epoch": 1.8224956063268891, "grad_norm": 0.3119990672776977, "learning_rate": 6.414223767814286e-05, "loss": 0.5446, "step": 1037 }, { "epoch": 1.8242530755711774, "grad_norm": 0.31335544311966734, "learning_rate": 6.410308112301017e-05, "loss": 0.5504, "step": 1038 }, { "epoch": 1.8260105448154658, "grad_norm": 0.34253445455223386, "learning_rate": 6.406388826907288e-05, "loss": 0.5461, "step": 1039 }, { "epoch": 1.827768014059754, "grad_norm": 0.37007095935502476, "learning_rate": 6.402465917535477e-05, "loss": 0.5453, "step": 1040 }, { "epoch": 1.8295254833040422, "grad_norm": 0.36346249453364077, "learning_rate": 6.39853939009341e-05, "loss": 0.5369, "step": 1041 }, { "epoch": 1.8312829525483303, "grad_norm": 0.39895151757416086, "learning_rate": 6.394609250494367e-05, "loss": 0.5364, "step": 1042 }, { "epoch": 1.8330404217926186, "grad_norm": 0.4451711481498326, "learning_rate": 6.390675504657072e-05, "loss": 0.5296, "step": 1043 }, { "epoch": 1.834797891036907, "grad_norm": 0.5023716213197088, "learning_rate": 6.386738158505669e-05, "loss": 0.5483, "step": 1044 }, { "epoch": 1.836555360281195, "grad_norm": 0.5388687095599589, "learning_rate": 6.382797217969734e-05, "loss": 0.5425, "step": 1045 }, { "epoch": 1.8383128295254831, "grad_norm": 0.7011308615711588, "learning_rate": 6.378852688984251e-05, "loss": 0.5321, "step": 1046 }, { "epoch": 1.8400702987697715, "grad_norm": 0.8788462409420582, "learning_rate": 6.374904577489608e-05, "loss": 0.534, "step": 1047 }, { "epoch": 1.8418277680140598, "grad_norm": 1.045803961573671, "learning_rate": 6.370952889431593e-05, "loss": 0.5496, "step": 1048 }, { "epoch": 1.843585237258348, "grad_norm": 0.9353865818541721, "learning_rate": 6.366997630761372e-05, "loss": 0.545, "step": 1049 }, { "epoch": 1.8453427065026362, "grad_norm": 0.7297785036272865, "learning_rate": 6.363038807435499e-05, "loss": 0.5539, "step": 1050 }, { "epoch": 1.8471001757469243, "grad_norm": 0.5201539088546266, "learning_rate": 6.359076425415884e-05, "loss": 0.5699, "step": 1051 }, { "epoch": 1.8488576449912126, "grad_norm": 0.36269628271588045, "learning_rate": 6.355110490669808e-05, "loss": 0.5424, "step": 1052 }, { "epoch": 1.850615114235501, "grad_norm": 0.47154412625270636, "learning_rate": 6.351141009169893e-05, "loss": 0.5448, "step": 1053 }, { "epoch": 1.852372583479789, "grad_norm": 0.575452264797244, "learning_rate": 6.34716798689411e-05, "loss": 0.5458, "step": 1054 }, { "epoch": 1.8541300527240774, "grad_norm": 0.5327085778669232, "learning_rate": 6.343191429825756e-05, "loss": 0.5541, "step": 1055 }, { "epoch": 1.8558875219683655, "grad_norm": 0.4875828784558593, "learning_rate": 6.339211343953456e-05, "loss": 0.5316, "step": 1056 }, { "epoch": 1.8576449912126538, "grad_norm": 0.43543464866935067, "learning_rate": 6.335227735271146e-05, "loss": 0.5406, "step": 1057 }, { "epoch": 1.8594024604569421, "grad_norm": 0.31621824930888837, "learning_rate": 6.33124060977807e-05, "loss": 0.5459, "step": 1058 }, { "epoch": 1.8611599297012302, "grad_norm": 0.3348437950273745, "learning_rate": 6.327249973478765e-05, "loss": 0.5366, "step": 1059 }, { "epoch": 1.8629173989455183, "grad_norm": 0.4492727984235683, "learning_rate": 6.323255832383059e-05, "loss": 0.5506, "step": 1060 }, { "epoch": 1.8646748681898067, "grad_norm": 0.5074563778477226, "learning_rate": 6.319258192506056e-05, "loss": 0.5502, "step": 1061 }, { "epoch": 1.866432337434095, "grad_norm": 0.5557627131939148, "learning_rate": 6.315257059868128e-05, "loss": 0.5399, "step": 1062 }, { "epoch": 1.868189806678383, "grad_norm": 0.5563902754618347, "learning_rate": 6.31125244049491e-05, "loss": 0.5357, "step": 1063 }, { "epoch": 1.8699472759226714, "grad_norm": 0.5559536679183099, "learning_rate": 6.307244340417286e-05, "loss": 0.5341, "step": 1064 }, { "epoch": 1.8717047451669595, "grad_norm": 0.6474104913979678, "learning_rate": 6.303232765671383e-05, "loss": 0.5552, "step": 1065 }, { "epoch": 1.8734622144112478, "grad_norm": 0.6431492001401354, "learning_rate": 6.299217722298558e-05, "loss": 0.5495, "step": 1066 }, { "epoch": 1.8752196836555362, "grad_norm": 0.5168188346690431, "learning_rate": 6.295199216345393e-05, "loss": 0.5315, "step": 1067 }, { "epoch": 1.8769771528998243, "grad_norm": 0.47202508190196646, "learning_rate": 6.291177253863691e-05, "loss": 0.5309, "step": 1068 }, { "epoch": 1.8787346221441124, "grad_norm": 0.46432904749123755, "learning_rate": 6.287151840910449e-05, "loss": 0.5418, "step": 1069 }, { "epoch": 1.8804920913884007, "grad_norm": 0.3734174693561488, "learning_rate": 6.28312298354787e-05, "loss": 0.5284, "step": 1070 }, { "epoch": 1.882249560632689, "grad_norm": 0.3342592395154356, "learning_rate": 6.279090687843338e-05, "loss": 0.5437, "step": 1071 }, { "epoch": 1.884007029876977, "grad_norm": 0.40849810672756376, "learning_rate": 6.275054959869417e-05, "loss": 0.5529, "step": 1072 }, { "epoch": 1.8857644991212654, "grad_norm": 0.5053731572406688, "learning_rate": 6.271015805703843e-05, "loss": 0.5455, "step": 1073 }, { "epoch": 1.8875219683655535, "grad_norm": 0.46319085870371807, "learning_rate": 6.266973231429507e-05, "loss": 0.5402, "step": 1074 }, { "epoch": 1.8892794376098418, "grad_norm": 0.4964608551972014, "learning_rate": 6.262927243134453e-05, "loss": 0.5258, "step": 1075 }, { "epoch": 1.8910369068541302, "grad_norm": 0.5987021542347749, "learning_rate": 6.258877846911868e-05, "loss": 0.549, "step": 1076 }, { "epoch": 1.8927943760984183, "grad_norm": 0.6236136769114576, "learning_rate": 6.254825048860067e-05, "loss": 0.5453, "step": 1077 }, { "epoch": 1.8945518453427064, "grad_norm": 0.5966051398799642, "learning_rate": 6.250768855082493e-05, "loss": 0.541, "step": 1078 }, { "epoch": 1.8963093145869947, "grad_norm": 0.5127318189933557, "learning_rate": 6.246709271687699e-05, "loss": 0.5486, "step": 1079 }, { "epoch": 1.898066783831283, "grad_norm": 0.5294651677314373, "learning_rate": 6.242646304789345e-05, "loss": 0.5408, "step": 1080 }, { "epoch": 1.8998242530755711, "grad_norm": 0.521760159388788, "learning_rate": 6.238579960506184e-05, "loss": 0.5363, "step": 1081 }, { "epoch": 1.9015817223198594, "grad_norm": 0.4064887151441861, "learning_rate": 6.23451024496206e-05, "loss": 0.5373, "step": 1082 }, { "epoch": 1.9033391915641475, "grad_norm": 0.39124094621852334, "learning_rate": 6.230437164285888e-05, "loss": 0.5444, "step": 1083 }, { "epoch": 1.9050966608084359, "grad_norm": 0.44878310088047174, "learning_rate": 6.226360724611657e-05, "loss": 0.5343, "step": 1084 }, { "epoch": 1.9068541300527242, "grad_norm": 0.4416751741745162, "learning_rate": 6.22228093207841e-05, "loss": 0.5422, "step": 1085 }, { "epoch": 1.9086115992970123, "grad_norm": 0.35905217299976805, "learning_rate": 6.218197792830239e-05, "loss": 0.5359, "step": 1086 }, { "epoch": 1.9103690685413004, "grad_norm": 0.39595552560389896, "learning_rate": 6.21411131301628e-05, "loss": 0.5509, "step": 1087 }, { "epoch": 1.9121265377855887, "grad_norm": 0.4995450106154974, "learning_rate": 6.210021498790698e-05, "loss": 0.5349, "step": 1088 }, { "epoch": 1.913884007029877, "grad_norm": 0.6246168588788666, "learning_rate": 6.205928356312678e-05, "loss": 0.5478, "step": 1089 }, { "epoch": 1.9156414762741654, "grad_norm": 0.6681550205581566, "learning_rate": 6.20183189174642e-05, "loss": 0.5455, "step": 1090 }, { "epoch": 1.9173989455184535, "grad_norm": 0.7319209192833181, "learning_rate": 6.197732111261124e-05, "loss": 0.544, "step": 1091 }, { "epoch": 1.9191564147627416, "grad_norm": 0.8799685866875524, "learning_rate": 6.193629021030987e-05, "loss": 0.5398, "step": 1092 }, { "epoch": 1.92091388400703, "grad_norm": 1.0530199193148286, "learning_rate": 6.189522627235188e-05, "loss": 0.5478, "step": 1093 }, { "epoch": 1.9226713532513182, "grad_norm": 0.9307557165593965, "learning_rate": 6.18541293605788e-05, "loss": 0.5264, "step": 1094 }, { "epoch": 1.9244288224956063, "grad_norm": 0.658232049581039, "learning_rate": 6.181299953688186e-05, "loss": 0.5501, "step": 1095 }, { "epoch": 1.9261862917398944, "grad_norm": 0.5081941419696486, "learning_rate": 6.177183686320184e-05, "loss": 0.5328, "step": 1096 }, { "epoch": 1.9279437609841827, "grad_norm": 0.3518241110213672, "learning_rate": 6.173064140152894e-05, "loss": 0.5303, "step": 1097 }, { "epoch": 1.929701230228471, "grad_norm": 0.3697019446691154, "learning_rate": 6.168941321390284e-05, "loss": 0.5247, "step": 1098 }, { "epoch": 1.9314586994727594, "grad_norm": 0.5413405990755409, "learning_rate": 6.164815236241239e-05, "loss": 0.533, "step": 1099 }, { "epoch": 1.9332161687170475, "grad_norm": 0.5914077910836915, "learning_rate": 6.160685890919573e-05, "loss": 0.5408, "step": 1100 }, { "epoch": 1.9349736379613356, "grad_norm": 0.5046062726446828, "learning_rate": 6.156553291644002e-05, "loss": 0.5459, "step": 1101 }, { "epoch": 1.936731107205624, "grad_norm": 0.47111598847173636, "learning_rate": 6.152417444638147e-05, "loss": 0.5352, "step": 1102 }, { "epoch": 1.9384885764499122, "grad_norm": 0.4926035413184022, "learning_rate": 6.14827835613052e-05, "loss": 0.5509, "step": 1103 }, { "epoch": 1.9402460456942003, "grad_norm": 0.6266449148379016, "learning_rate": 6.144136032354513e-05, "loss": 0.5398, "step": 1104 }, { "epoch": 1.9420035149384884, "grad_norm": 0.8384529343873982, "learning_rate": 6.13999047954839e-05, "loss": 0.5454, "step": 1105 }, { "epoch": 1.9437609841827768, "grad_norm": 0.9922033911375286, "learning_rate": 6.135841703955278e-05, "loss": 0.5357, "step": 1106 }, { "epoch": 1.945518453427065, "grad_norm": 0.9314579781737665, "learning_rate": 6.131689711823161e-05, "loss": 0.5328, "step": 1107 }, { "epoch": 1.9472759226713534, "grad_norm": 0.7558159414737873, "learning_rate": 6.12753450940486e-05, "loss": 0.5368, "step": 1108 }, { "epoch": 1.9490333919156415, "grad_norm": 0.5355359628417511, "learning_rate": 6.123376102958038e-05, "loss": 0.5416, "step": 1109 }, { "epoch": 1.9507908611599296, "grad_norm": 0.3245727013118568, "learning_rate": 6.11921449874518e-05, "loss": 0.5404, "step": 1110 }, { "epoch": 1.952548330404218, "grad_norm": 0.290269016414199, "learning_rate": 6.115049703033584e-05, "loss": 0.5336, "step": 1111 }, { "epoch": 1.9543057996485063, "grad_norm": 0.5551422204348799, "learning_rate": 6.110881722095359e-05, "loss": 0.5409, "step": 1112 }, { "epoch": 1.9560632688927944, "grad_norm": 0.743316280603737, "learning_rate": 6.106710562207409e-05, "loss": 0.5447, "step": 1113 }, { "epoch": 1.9578207381370825, "grad_norm": 0.6592768614127914, "learning_rate": 6.102536229651424e-05, "loss": 0.5467, "step": 1114 }, { "epoch": 1.9595782073813708, "grad_norm": 0.36340609503544774, "learning_rate": 6.0983587307138756e-05, "loss": 0.5257, "step": 1115 }, { "epoch": 1.961335676625659, "grad_norm": 0.329039807660089, "learning_rate": 6.094178071686001e-05, "loss": 0.54, "step": 1116 }, { "epoch": 1.9630931458699474, "grad_norm": 0.36903654231530836, "learning_rate": 6.0899942588637955e-05, "loss": 0.5344, "step": 1117 }, { "epoch": 1.9648506151142355, "grad_norm": 0.464788269690655, "learning_rate": 6.085807298548008e-05, "loss": 0.5405, "step": 1118 }, { "epoch": 1.9666080843585236, "grad_norm": 0.5317887103180102, "learning_rate": 6.081617197044123e-05, "loss": 0.537, "step": 1119 }, { "epoch": 1.968365553602812, "grad_norm": 0.4720758440243803, "learning_rate": 6.077423960662359e-05, "loss": 0.5396, "step": 1120 }, { "epoch": 1.9701230228471003, "grad_norm": 0.3461433015280955, "learning_rate": 6.073227595717653e-05, "loss": 0.5313, "step": 1121 }, { "epoch": 1.9718804920913884, "grad_norm": 0.42471674794655867, "learning_rate": 6.069028108529656e-05, "loss": 0.5394, "step": 1122 }, { "epoch": 1.9736379613356765, "grad_norm": 0.5690448956148386, "learning_rate": 6.06482550542272e-05, "loss": 0.5578, "step": 1123 }, { "epoch": 1.9753954305799648, "grad_norm": 0.5227386690694226, "learning_rate": 6.0606197927258874e-05, "loss": 0.5192, "step": 1124 }, { "epoch": 1.9771528998242531, "grad_norm": 0.34882490766654056, "learning_rate": 6.056410976772887e-05, "loss": 0.5469, "step": 1125 }, { "epoch": 1.9789103690685415, "grad_norm": 0.40364533111093664, "learning_rate": 6.05219906390212e-05, "loss": 0.5465, "step": 1126 }, { "epoch": 1.9806678383128296, "grad_norm": 0.4146944275760814, "learning_rate": 6.0479840604566506e-05, "loss": 0.5412, "step": 1127 }, { "epoch": 1.9824253075571177, "grad_norm": 0.3778056451772718, "learning_rate": 6.0437659727841974e-05, "loss": 0.5448, "step": 1128 }, { "epoch": 1.984182776801406, "grad_norm": 0.3084658621492258, "learning_rate": 6.039544807237123e-05, "loss": 0.5336, "step": 1129 }, { "epoch": 1.9859402460456943, "grad_norm": 0.29727729662548175, "learning_rate": 6.0353205701724286e-05, "loss": 0.5445, "step": 1130 }, { "epoch": 1.9876977152899824, "grad_norm": 0.3057317745510216, "learning_rate": 6.031093267951738e-05, "loss": 0.5411, "step": 1131 }, { "epoch": 1.9894551845342705, "grad_norm": 0.3358845944229429, "learning_rate": 6.0268629069412934e-05, "loss": 0.5287, "step": 1132 }, { "epoch": 1.9912126537785588, "grad_norm": 0.32476986227751065, "learning_rate": 6.022629493511939e-05, "loss": 0.5536, "step": 1133 }, { "epoch": 1.9929701230228472, "grad_norm": 0.3609680153890359, "learning_rate": 6.018393034039121e-05, "loss": 0.5401, "step": 1134 }, { "epoch": 1.9947275922671355, "grad_norm": 0.38237892201389073, "learning_rate": 6.014153534902871e-05, "loss": 0.5378, "step": 1135 }, { "epoch": 1.9964850615114236, "grad_norm": 0.4086273005804138, "learning_rate": 6.009911002487797e-05, "loss": 0.5481, "step": 1136 }, { "epoch": 1.9982425307557117, "grad_norm": 0.3800987146874098, "learning_rate": 6.0056654431830764e-05, "loss": 0.5405, "step": 1137 }, { "epoch": 2.0, "grad_norm": 0.42413844362645664, "learning_rate": 6.001416863382446e-05, "loss": 0.5272, "step": 1138 }, { "epoch": 2.0017574692442883, "grad_norm": 0.4662395595485263, "learning_rate": 5.9971652694841885e-05, "loss": 0.5173, "step": 1139 }, { "epoch": 2.0035149384885766, "grad_norm": 0.519748001464098, "learning_rate": 5.992910667891128e-05, "loss": 0.5279, "step": 1140 }, { "epoch": 2.0052724077328645, "grad_norm": 0.6155479948166603, "learning_rate": 5.988653065010618e-05, "loss": 0.5229, "step": 1141 }, { "epoch": 2.007029876977153, "grad_norm": 0.5102286075141246, "learning_rate": 5.9843924672545316e-05, "loss": 0.5173, "step": 1142 }, { "epoch": 2.008787346221441, "grad_norm": 0.6971842256552616, "learning_rate": 5.980128881039252e-05, "loss": 0.5197, "step": 1143 }, { "epoch": 2.0105448154657295, "grad_norm": 0.769707289887016, "learning_rate": 5.975862312785661e-05, "loss": 0.514, "step": 1144 }, { "epoch": 2.0123022847100174, "grad_norm": 0.695232442358851, "learning_rate": 5.971592768919137e-05, "loss": 0.5305, "step": 1145 }, { "epoch": 2.0140597539543057, "grad_norm": 0.752339343246401, "learning_rate": 5.967320255869533e-05, "loss": 0.514, "step": 1146 }, { "epoch": 2.015817223198594, "grad_norm": 0.7669941732308564, "learning_rate": 5.963044780071177e-05, "loss": 0.5234, "step": 1147 }, { "epoch": 2.0175746924428823, "grad_norm": 0.6848690413821964, "learning_rate": 5.9587663479628596e-05, "loss": 0.5336, "step": 1148 }, { "epoch": 2.0193321616871707, "grad_norm": 0.530957431582637, "learning_rate": 5.9544849659878206e-05, "loss": 0.5184, "step": 1149 }, { "epoch": 2.0210896309314585, "grad_norm": 0.31166363719456375, "learning_rate": 5.950200640593745e-05, "loss": 0.517, "step": 1150 }, { "epoch": 2.022847100175747, "grad_norm": 0.502820661545691, "learning_rate": 5.9459133782327475e-05, "loss": 0.5187, "step": 1151 }, { "epoch": 2.024604569420035, "grad_norm": 0.6631645722375302, "learning_rate": 5.941623185361371e-05, "loss": 0.5136, "step": 1152 }, { "epoch": 2.0263620386643235, "grad_norm": 0.5958460669390597, "learning_rate": 5.937330068440565e-05, "loss": 0.5144, "step": 1153 }, { "epoch": 2.0281195079086114, "grad_norm": 0.5652748559494456, "learning_rate": 5.933034033935689e-05, "loss": 0.5218, "step": 1154 }, { "epoch": 2.0298769771528997, "grad_norm": 0.5713785167554606, "learning_rate": 5.928735088316492e-05, "loss": 0.5253, "step": 1155 }, { "epoch": 2.031634446397188, "grad_norm": 0.44090539920953903, "learning_rate": 5.9244332380571075e-05, "loss": 0.5175, "step": 1156 }, { "epoch": 2.0333919156414764, "grad_norm": 0.4587467544198981, "learning_rate": 5.9201284896360454e-05, "loss": 0.5227, "step": 1157 }, { "epoch": 2.0351493848857647, "grad_norm": 0.45379128789769424, "learning_rate": 5.915820849536179e-05, "loss": 0.5119, "step": 1158 }, { "epoch": 2.0369068541300526, "grad_norm": 0.3512454845046946, "learning_rate": 5.9115103242447346e-05, "loss": 0.5176, "step": 1159 }, { "epoch": 2.038664323374341, "grad_norm": 0.40008112881312674, "learning_rate": 5.90719692025329e-05, "loss": 0.5085, "step": 1160 }, { "epoch": 2.040421792618629, "grad_norm": 0.38234047491704104, "learning_rate": 5.902880644057749e-05, "loss": 0.5149, "step": 1161 }, { "epoch": 2.0421792618629175, "grad_norm": 0.6400716539471412, "learning_rate": 5.8985615021583464e-05, "loss": 0.5227, "step": 1162 }, { "epoch": 2.0439367311072054, "grad_norm": 0.7325588988779584, "learning_rate": 5.8942395010596334e-05, "loss": 0.5262, "step": 1163 }, { "epoch": 2.0456942003514937, "grad_norm": 0.6846974311519511, "learning_rate": 5.889914647270464e-05, "loss": 0.5216, "step": 1164 }, { "epoch": 2.047451669595782, "grad_norm": 0.6740065780192805, "learning_rate": 5.8855869473039924e-05, "loss": 0.5069, "step": 1165 }, { "epoch": 2.0492091388400704, "grad_norm": 0.7538611600746228, "learning_rate": 5.8812564076776525e-05, "loss": 0.5147, "step": 1166 }, { "epoch": 2.0509666080843587, "grad_norm": 0.8127203401347771, "learning_rate": 5.876923034913162e-05, "loss": 0.5133, "step": 1167 }, { "epoch": 2.0527240773286466, "grad_norm": 0.7617695074199529, "learning_rate": 5.872586835536501e-05, "loss": 0.5227, "step": 1168 }, { "epoch": 2.054481546572935, "grad_norm": 0.7051767250276326, "learning_rate": 5.868247816077906e-05, "loss": 0.5307, "step": 1169 }, { "epoch": 2.0562390158172232, "grad_norm": 0.5635365115270297, "learning_rate": 5.863905983071865e-05, "loss": 0.5202, "step": 1170 }, { "epoch": 2.0579964850615116, "grad_norm": 0.41716634081291104, "learning_rate": 5.859561343057099e-05, "loss": 0.5053, "step": 1171 }, { "epoch": 2.0597539543057994, "grad_norm": 0.4624744578376148, "learning_rate": 5.855213902576554e-05, "loss": 0.5157, "step": 1172 }, { "epoch": 2.0615114235500878, "grad_norm": 0.5256751401338193, "learning_rate": 5.8508636681774017e-05, "loss": 0.5211, "step": 1173 }, { "epoch": 2.063268892794376, "grad_norm": 0.5250723504805005, "learning_rate": 5.8465106464110114e-05, "loss": 0.5144, "step": 1174 }, { "epoch": 2.0650263620386644, "grad_norm": 0.5034845587692587, "learning_rate": 5.8421548438329604e-05, "loss": 0.5134, "step": 1175 }, { "epoch": 2.0667838312829527, "grad_norm": 0.4619845199239309, "learning_rate": 5.8377962670030056e-05, "loss": 0.519, "step": 1176 }, { "epoch": 2.0685413005272406, "grad_norm": 0.3856674881383062, "learning_rate": 5.8334349224850854e-05, "loss": 0.5256, "step": 1177 }, { "epoch": 2.070298769771529, "grad_norm": 0.3996314933676772, "learning_rate": 5.829070816847307e-05, "loss": 0.5189, "step": 1178 }, { "epoch": 2.0720562390158173, "grad_norm": 0.4176584105508727, "learning_rate": 5.824703956661932e-05, "loss": 0.5122, "step": 1179 }, { "epoch": 2.0738137082601056, "grad_norm": 0.3912437439240953, "learning_rate": 5.820334348505376e-05, "loss": 0.5195, "step": 1180 }, { "epoch": 2.0755711775043935, "grad_norm": 0.377701731660329, "learning_rate": 5.8159619989581874e-05, "loss": 0.5304, "step": 1181 }, { "epoch": 2.077328646748682, "grad_norm": 0.4580901316534192, "learning_rate": 5.8115869146050475e-05, "loss": 0.5214, "step": 1182 }, { "epoch": 2.07908611599297, "grad_norm": 0.4598207182226081, "learning_rate": 5.807209102034753e-05, "loss": 0.5204, "step": 1183 }, { "epoch": 2.0808435852372584, "grad_norm": 0.4014437395458196, "learning_rate": 5.8028285678402105e-05, "loss": 0.5175, "step": 1184 }, { "epoch": 2.0826010544815468, "grad_norm": 0.3783210629678677, "learning_rate": 5.798445318618427e-05, "loss": 0.5174, "step": 1185 }, { "epoch": 2.0843585237258346, "grad_norm": 0.41227675683261644, "learning_rate": 5.794059360970495e-05, "loss": 0.5288, "step": 1186 }, { "epoch": 2.086115992970123, "grad_norm": 0.5048356415215696, "learning_rate": 5.789670701501587e-05, "loss": 0.523, "step": 1187 }, { "epoch": 2.0878734622144113, "grad_norm": 0.4857444235659286, "learning_rate": 5.7852793468209466e-05, "loss": 0.5168, "step": 1188 }, { "epoch": 2.0896309314586996, "grad_norm": 0.4043149526798999, "learning_rate": 5.7808853035418736e-05, "loss": 0.5099, "step": 1189 }, { "epoch": 2.0913884007029875, "grad_norm": 0.3690429316556121, "learning_rate": 5.7764885782817176e-05, "loss": 0.5041, "step": 1190 }, { "epoch": 2.093145869947276, "grad_norm": 0.38813525676786204, "learning_rate": 5.772089177661867e-05, "loss": 0.52, "step": 1191 }, { "epoch": 2.094903339191564, "grad_norm": 0.3028251292939506, "learning_rate": 5.7676871083077416e-05, "loss": 0.5233, "step": 1192 }, { "epoch": 2.0966608084358525, "grad_norm": 0.3234004733575296, "learning_rate": 5.763282376848777e-05, "loss": 0.5304, "step": 1193 }, { "epoch": 2.0984182776801408, "grad_norm": 0.31909876903755646, "learning_rate": 5.758874989918418e-05, "loss": 0.5227, "step": 1194 }, { "epoch": 2.1001757469244287, "grad_norm": 0.3567907166218323, "learning_rate": 5.7544649541541115e-05, "loss": 0.5201, "step": 1195 }, { "epoch": 2.101933216168717, "grad_norm": 0.42859797362285246, "learning_rate": 5.7500522761972906e-05, "loss": 0.5137, "step": 1196 }, { "epoch": 2.1036906854130053, "grad_norm": 0.4475934837176111, "learning_rate": 5.745636962693368e-05, "loss": 0.5312, "step": 1197 }, { "epoch": 2.1054481546572936, "grad_norm": 0.4652247554602312, "learning_rate": 5.741219020291729e-05, "loss": 0.5163, "step": 1198 }, { "epoch": 2.1072056239015815, "grad_norm": 0.47146545758604963, "learning_rate": 5.736798455645712e-05, "loss": 0.5173, "step": 1199 }, { "epoch": 2.10896309314587, "grad_norm": 0.5163522876596625, "learning_rate": 5.732375275412608e-05, "loss": 0.5227, "step": 1200 }, { "epoch": 2.110720562390158, "grad_norm": 0.5006986542880699, "learning_rate": 5.727949486253648e-05, "loss": 0.5147, "step": 1201 }, { "epoch": 2.1124780316344465, "grad_norm": 0.4590579034543106, "learning_rate": 5.723521094833988e-05, "loss": 0.5183, "step": 1202 }, { "epoch": 2.114235500878735, "grad_norm": 0.4498682329686094, "learning_rate": 5.719090107822708e-05, "loss": 0.526, "step": 1203 }, { "epoch": 2.1159929701230227, "grad_norm": 0.4372003345767447, "learning_rate": 5.714656531892793e-05, "loss": 0.5229, "step": 1204 }, { "epoch": 2.117750439367311, "grad_norm": 0.3617713027608036, "learning_rate": 5.710220373721129e-05, "loss": 0.5224, "step": 1205 }, { "epoch": 2.1195079086115993, "grad_norm": 0.3775680189241547, "learning_rate": 5.7057816399884896e-05, "loss": 0.5112, "step": 1206 }, { "epoch": 2.1212653778558876, "grad_norm": 0.3356180282710574, "learning_rate": 5.701340337379526e-05, "loss": 0.5088, "step": 1207 }, { "epoch": 2.1230228471001755, "grad_norm": 0.2623912737173368, "learning_rate": 5.6968964725827626e-05, "loss": 0.5261, "step": 1208 }, { "epoch": 2.124780316344464, "grad_norm": 0.3618983021670561, "learning_rate": 5.692450052290577e-05, "loss": 0.5246, "step": 1209 }, { "epoch": 2.126537785588752, "grad_norm": 0.4563194889014405, "learning_rate": 5.6880010831992e-05, "loss": 0.5119, "step": 1210 }, { "epoch": 2.1282952548330405, "grad_norm": 0.46577649572149055, "learning_rate": 5.683549572008696e-05, "loss": 0.5179, "step": 1211 }, { "epoch": 2.130052724077329, "grad_norm": 0.4957682050050259, "learning_rate": 5.679095525422963e-05, "loss": 0.5298, "step": 1212 }, { "epoch": 2.1318101933216167, "grad_norm": 0.4692073298968053, "learning_rate": 5.674638950149713e-05, "loss": 0.5029, "step": 1213 }, { "epoch": 2.133567662565905, "grad_norm": 0.3607574745106175, "learning_rate": 5.6701798529004684e-05, "loss": 0.4975, "step": 1214 }, { "epoch": 2.1353251318101933, "grad_norm": 0.4688081081189253, "learning_rate": 5.66571824039055e-05, "loss": 0.5234, "step": 1215 }, { "epoch": 2.1370826010544817, "grad_norm": 0.42203851122665004, "learning_rate": 5.661254119339064e-05, "loss": 0.5223, "step": 1216 }, { "epoch": 2.1388400702987695, "grad_norm": 0.5285428516818359, "learning_rate": 5.6567874964688954e-05, "loss": 0.5122, "step": 1217 }, { "epoch": 2.140597539543058, "grad_norm": 0.7669909366672123, "learning_rate": 5.6523183785067007e-05, "loss": 0.5152, "step": 1218 }, { "epoch": 2.142355008787346, "grad_norm": 0.8013955334301147, "learning_rate": 5.6478467721828875e-05, "loss": 0.5193, "step": 1219 }, { "epoch": 2.1441124780316345, "grad_norm": 0.6734233457925337, "learning_rate": 5.643372684231616e-05, "loss": 0.5234, "step": 1220 }, { "epoch": 2.145869947275923, "grad_norm": 0.6198127010821751, "learning_rate": 5.638896121390782e-05, "loss": 0.5166, "step": 1221 }, { "epoch": 2.1476274165202107, "grad_norm": 0.5101306681349663, "learning_rate": 5.634417090402007e-05, "loss": 0.5178, "step": 1222 }, { "epoch": 2.149384885764499, "grad_norm": 0.30811275329951254, "learning_rate": 5.629935598010631e-05, "loss": 0.5184, "step": 1223 }, { "epoch": 2.1511423550087874, "grad_norm": 0.4145767943110988, "learning_rate": 5.6254516509657006e-05, "loss": 0.5094, "step": 1224 }, { "epoch": 2.1528998242530757, "grad_norm": 0.5450800704696313, "learning_rate": 5.620965256019959e-05, "loss": 0.5202, "step": 1225 }, { "epoch": 2.1546572934973636, "grad_norm": 0.5786542256574648, "learning_rate": 5.616476419929838e-05, "loss": 0.5178, "step": 1226 }, { "epoch": 2.156414762741652, "grad_norm": 0.5230174770303593, "learning_rate": 5.61198514945544e-05, "loss": 0.5246, "step": 1227 }, { "epoch": 2.15817223198594, "grad_norm": 0.5420146254394205, "learning_rate": 5.60749145136054e-05, "loss": 0.5174, "step": 1228 }, { "epoch": 2.1599297012302285, "grad_norm": 0.43190859292121825, "learning_rate": 5.6029953324125636e-05, "loss": 0.5069, "step": 1229 }, { "epoch": 2.161687170474517, "grad_norm": 0.34487724717098595, "learning_rate": 5.598496799382588e-05, "loss": 0.5226, "step": 1230 }, { "epoch": 2.1634446397188047, "grad_norm": 0.3811843390216026, "learning_rate": 5.5939958590453205e-05, "loss": 0.5079, "step": 1231 }, { "epoch": 2.165202108963093, "grad_norm": 0.5052503704631418, "learning_rate": 5.5894925181790944e-05, "loss": 0.5219, "step": 1232 }, { "epoch": 2.1669595782073814, "grad_norm": 0.522344840164043, "learning_rate": 5.584986783565863e-05, "loss": 0.5108, "step": 1233 }, { "epoch": 2.1687170474516697, "grad_norm": 0.5280236465562265, "learning_rate": 5.580478661991179e-05, "loss": 0.5176, "step": 1234 }, { "epoch": 2.1704745166959576, "grad_norm": 0.44750008557626464, "learning_rate": 5.5759681602441933e-05, "loss": 0.5157, "step": 1235 }, { "epoch": 2.172231985940246, "grad_norm": 0.3803796708510697, "learning_rate": 5.571455285117639e-05, "loss": 0.5196, "step": 1236 }, { "epoch": 2.1739894551845342, "grad_norm": 0.3564117743293797, "learning_rate": 5.566940043407824e-05, "loss": 0.5183, "step": 1237 }, { "epoch": 2.1757469244288226, "grad_norm": 0.33664717447067205, "learning_rate": 5.562422441914623e-05, "loss": 0.5119, "step": 1238 }, { "epoch": 2.177504393673111, "grad_norm": 0.34231686521123766, "learning_rate": 5.55790248744146e-05, "loss": 0.5118, "step": 1239 }, { "epoch": 2.1792618629173988, "grad_norm": 0.33635343346173696, "learning_rate": 5.553380186795306e-05, "loss": 0.5189, "step": 1240 }, { "epoch": 2.181019332161687, "grad_norm": 0.32130478345716623, "learning_rate": 5.548855546786663e-05, "loss": 0.5142, "step": 1241 }, { "epoch": 2.1827768014059754, "grad_norm": 0.3153724883761827, "learning_rate": 5.544328574229557e-05, "loss": 0.5085, "step": 1242 }, { "epoch": 2.1845342706502637, "grad_norm": 0.30170207661498855, "learning_rate": 5.53979927594153e-05, "loss": 0.5069, "step": 1243 }, { "epoch": 2.1862917398945516, "grad_norm": 0.31259530762147886, "learning_rate": 5.5352676587436184e-05, "loss": 0.5199, "step": 1244 }, { "epoch": 2.18804920913884, "grad_norm": 0.36830179423516135, "learning_rate": 5.5307337294603595e-05, "loss": 0.5145, "step": 1245 }, { "epoch": 2.1898066783831283, "grad_norm": 0.40458599866256023, "learning_rate": 5.526197494919768e-05, "loss": 0.5168, "step": 1246 }, { "epoch": 2.1915641476274166, "grad_norm": 0.37455347026427915, "learning_rate": 5.521658961953329e-05, "loss": 0.5353, "step": 1247 }, { "epoch": 2.193321616871705, "grad_norm": 0.3090432751098944, "learning_rate": 5.5171181373959956e-05, "loss": 0.5205, "step": 1248 }, { "epoch": 2.195079086115993, "grad_norm": 0.2712878361532458, "learning_rate": 5.512575028086162e-05, "loss": 0.5046, "step": 1249 }, { "epoch": 2.196836555360281, "grad_norm": 0.32802375244112514, "learning_rate": 5.5080296408656735e-05, "loss": 0.5218, "step": 1250 }, { "epoch": 2.1985940246045694, "grad_norm": 0.32019830720460934, "learning_rate": 5.503481982579799e-05, "loss": 0.516, "step": 1251 }, { "epoch": 2.2003514938488578, "grad_norm": 0.6643839428666275, "learning_rate": 5.498932060077229e-05, "loss": 0.5208, "step": 1252 }, { "epoch": 2.202108963093146, "grad_norm": 0.32312410452337403, "learning_rate": 5.494379880210066e-05, "loss": 0.5119, "step": 1253 }, { "epoch": 2.203866432337434, "grad_norm": 0.27490085730001595, "learning_rate": 5.4898254498338096e-05, "loss": 0.5218, "step": 1254 }, { "epoch": 2.2056239015817223, "grad_norm": 0.3865785860618901, "learning_rate": 5.485268775807351e-05, "loss": 0.516, "step": 1255 }, { "epoch": 2.2073813708260106, "grad_norm": 0.40183780154659454, "learning_rate": 5.480709864992958e-05, "loss": 0.5233, "step": 1256 }, { "epoch": 2.209138840070299, "grad_norm": 0.3794737226460968, "learning_rate": 5.476148724256266e-05, "loss": 0.5075, "step": 1257 }, { "epoch": 2.210896309314587, "grad_norm": 0.35481698075385854, "learning_rate": 5.4715853604662744e-05, "loss": 0.5156, "step": 1258 }, { "epoch": 2.212653778558875, "grad_norm": 0.5007174895420912, "learning_rate": 5.4670197804953234e-05, "loss": 0.5189, "step": 1259 }, { "epoch": 2.2144112478031635, "grad_norm": 0.5082764046513851, "learning_rate": 5.462451991219096e-05, "loss": 0.5175, "step": 1260 }, { "epoch": 2.2161687170474518, "grad_norm": 0.42140040357213104, "learning_rate": 5.4578819995166005e-05, "loss": 0.5292, "step": 1261 }, { "epoch": 2.21792618629174, "grad_norm": 0.36269956507890694, "learning_rate": 5.453309812270158e-05, "loss": 0.5324, "step": 1262 }, { "epoch": 2.219683655536028, "grad_norm": 0.3655508333331778, "learning_rate": 5.4487354363654037e-05, "loss": 0.5093, "step": 1263 }, { "epoch": 2.2214411247803163, "grad_norm": 0.4177461021569192, "learning_rate": 5.444158878691265e-05, "loss": 0.5169, "step": 1264 }, { "epoch": 2.2231985940246046, "grad_norm": 0.42306349704599266, "learning_rate": 5.4395801461399536e-05, "loss": 0.5211, "step": 1265 }, { "epoch": 2.224956063268893, "grad_norm": 0.34244619953503824, "learning_rate": 5.434999245606959e-05, "loss": 0.5096, "step": 1266 }, { "epoch": 2.226713532513181, "grad_norm": 0.3105721917415473, "learning_rate": 5.4304161839910316e-05, "loss": 0.5117, "step": 1267 }, { "epoch": 2.228471001757469, "grad_norm": 0.29317967504509845, "learning_rate": 5.4258309681941835e-05, "loss": 0.5164, "step": 1268 }, { "epoch": 2.2302284710017575, "grad_norm": 0.32501221531108865, "learning_rate": 5.421243605121665e-05, "loss": 0.5255, "step": 1269 }, { "epoch": 2.231985940246046, "grad_norm": 0.3736345198165878, "learning_rate": 5.4166541016819624e-05, "loss": 0.5246, "step": 1270 }, { "epoch": 2.233743409490334, "grad_norm": 0.3052006664671635, "learning_rate": 5.4120624647867845e-05, "loss": 0.5106, "step": 1271 }, { "epoch": 2.235500878734622, "grad_norm": 0.2732632652559558, "learning_rate": 5.407468701351053e-05, "loss": 0.5117, "step": 1272 }, { "epoch": 2.2372583479789103, "grad_norm": 0.2740215246700224, "learning_rate": 5.402872818292894e-05, "loss": 0.5204, "step": 1273 }, { "epoch": 2.2390158172231986, "grad_norm": 0.2545818565323285, "learning_rate": 5.398274822533622e-05, "loss": 0.5215, "step": 1274 }, { "epoch": 2.240773286467487, "grad_norm": 0.2385138161462725, "learning_rate": 5.3936747209977384e-05, "loss": 0.511, "step": 1275 }, { "epoch": 2.242530755711775, "grad_norm": 0.3124069736974948, "learning_rate": 5.389072520612913e-05, "loss": 0.5331, "step": 1276 }, { "epoch": 2.244288224956063, "grad_norm": 0.4069184309791982, "learning_rate": 5.384468228309972e-05, "loss": 0.5207, "step": 1277 }, { "epoch": 2.2460456942003515, "grad_norm": 0.42910992923794566, "learning_rate": 5.379861851022902e-05, "loss": 0.514, "step": 1278 }, { "epoch": 2.24780316344464, "grad_norm": 0.43688976839071375, "learning_rate": 5.3752533956888195e-05, "loss": 0.5222, "step": 1279 }, { "epoch": 2.249560632688928, "grad_norm": 0.4393831210441779, "learning_rate": 5.370642869247978e-05, "loss": 0.5184, "step": 1280 }, { "epoch": 2.251318101933216, "grad_norm": 0.41667732024059967, "learning_rate": 5.3660302786437454e-05, "loss": 0.5295, "step": 1281 }, { "epoch": 2.2530755711775043, "grad_norm": 0.3610189218582503, "learning_rate": 5.361415630822601e-05, "loss": 0.5227, "step": 1282 }, { "epoch": 2.2548330404217927, "grad_norm": 0.3798089060552469, "learning_rate": 5.35679893273412e-05, "loss": 0.5301, "step": 1283 }, { "epoch": 2.256590509666081, "grad_norm": 0.6292039822069969, "learning_rate": 5.352180191330965e-05, "loss": 0.5472, "step": 1284 }, { "epoch": 2.2583479789103693, "grad_norm": 0.5461480682760698, "learning_rate": 5.347559413568881e-05, "loss": 0.5111, "step": 1285 }, { "epoch": 2.260105448154657, "grad_norm": 0.5610376425139553, "learning_rate": 5.342936606406672e-05, "loss": 0.5276, "step": 1286 }, { "epoch": 2.2618629173989455, "grad_norm": 0.5889470680054738, "learning_rate": 5.338311776806205e-05, "loss": 0.5139, "step": 1287 }, { "epoch": 2.263620386643234, "grad_norm": 0.514823192809797, "learning_rate": 5.3336849317323866e-05, "loss": 0.5136, "step": 1288 }, { "epoch": 2.2653778558875217, "grad_norm": 0.5302222562604075, "learning_rate": 5.329056078153163e-05, "loss": 0.5328, "step": 1289 }, { "epoch": 2.26713532513181, "grad_norm": 0.5499194065621615, "learning_rate": 5.324425223039506e-05, "loss": 0.5165, "step": 1290 }, { "epoch": 2.2688927943760984, "grad_norm": 0.5897369130508727, "learning_rate": 5.319792373365398e-05, "loss": 0.5276, "step": 1291 }, { "epoch": 2.2706502636203867, "grad_norm": 0.5810420191134203, "learning_rate": 5.315157536107828e-05, "loss": 0.5157, "step": 1292 }, { "epoch": 2.272407732864675, "grad_norm": 0.3410153071217433, "learning_rate": 5.310520718246775e-05, "loss": 0.5176, "step": 1293 }, { "epoch": 2.2741652021089633, "grad_norm": 0.35084033641725193, "learning_rate": 5.3058819267652036e-05, "loss": 0.521, "step": 1294 }, { "epoch": 2.275922671353251, "grad_norm": 0.3127065864513147, "learning_rate": 5.301241168649053e-05, "loss": 0.514, "step": 1295 }, { "epoch": 2.2776801405975395, "grad_norm": 0.36382743615177227, "learning_rate": 5.296598450887217e-05, "loss": 0.5067, "step": 1296 }, { "epoch": 2.279437609841828, "grad_norm": 0.28306552207653224, "learning_rate": 5.2919537804715454e-05, "loss": 0.5236, "step": 1297 }, { "epoch": 2.281195079086116, "grad_norm": 0.32981386557125797, "learning_rate": 5.287307164396831e-05, "loss": 0.5236, "step": 1298 }, { "epoch": 2.282952548330404, "grad_norm": 0.45599457393210924, "learning_rate": 5.28265860966079e-05, "loss": 0.523, "step": 1299 }, { "epoch": 2.2847100175746924, "grad_norm": 0.4517526091078245, "learning_rate": 5.278008123264063e-05, "loss": 0.5198, "step": 1300 }, { "epoch": 2.2864674868189807, "grad_norm": 0.3591872946970674, "learning_rate": 5.273355712210199e-05, "loss": 0.5135, "step": 1301 }, { "epoch": 2.288224956063269, "grad_norm": 0.32839799647422435, "learning_rate": 5.2687013835056445e-05, "loss": 0.5161, "step": 1302 }, { "epoch": 2.2899824253075574, "grad_norm": 0.3636565694920546, "learning_rate": 5.264045144159736e-05, "loss": 0.5215, "step": 1303 }, { "epoch": 2.2917398945518452, "grad_norm": 0.41722802369714423, "learning_rate": 5.2593870011846836e-05, "loss": 0.5227, "step": 1304 }, { "epoch": 2.2934973637961336, "grad_norm": 0.32143530297693124, "learning_rate": 5.254726961595567e-05, "loss": 0.5166, "step": 1305 }, { "epoch": 2.295254833040422, "grad_norm": 0.37372814461571624, "learning_rate": 5.250065032410321e-05, "loss": 0.5125, "step": 1306 }, { "epoch": 2.29701230228471, "grad_norm": 0.3726015901459237, "learning_rate": 5.2454012206497284e-05, "loss": 0.5193, "step": 1307 }, { "epoch": 2.298769771528998, "grad_norm": 0.4196709100303249, "learning_rate": 5.240735533337405e-05, "loss": 0.5305, "step": 1308 }, { "epoch": 2.3005272407732864, "grad_norm": 0.4402175128046172, "learning_rate": 5.23606797749979e-05, "loss": 0.5133, "step": 1309 }, { "epoch": 2.3022847100175747, "grad_norm": 0.4987871932503848, "learning_rate": 5.23139856016614e-05, "loss": 0.523, "step": 1310 }, { "epoch": 2.304042179261863, "grad_norm": 0.4420599719224267, "learning_rate": 5.226727288368513e-05, "loss": 0.5213, "step": 1311 }, { "epoch": 2.3057996485061514, "grad_norm": 0.37725208831400603, "learning_rate": 5.222054169141758e-05, "loss": 0.5237, "step": 1312 }, { "epoch": 2.3075571177504393, "grad_norm": 0.341650534431525, "learning_rate": 5.2173792095235096e-05, "loss": 0.53, "step": 1313 }, { "epoch": 2.3093145869947276, "grad_norm": 0.33962583646349376, "learning_rate": 5.212702416554173e-05, "loss": 0.5352, "step": 1314 }, { "epoch": 2.311072056239016, "grad_norm": 0.33717034643428406, "learning_rate": 5.208023797276913e-05, "loss": 0.5367, "step": 1315 }, { "epoch": 2.3128295254833042, "grad_norm": 0.5125632512376561, "learning_rate": 5.203343358737647e-05, "loss": 0.5239, "step": 1316 }, { "epoch": 2.314586994727592, "grad_norm": 0.6321003417968413, "learning_rate": 5.198661107985027e-05, "loss": 0.511, "step": 1317 }, { "epoch": 2.3163444639718804, "grad_norm": 0.6803046313044214, "learning_rate": 5.193977052070442e-05, "loss": 0.5247, "step": 1318 }, { "epoch": 2.3181019332161688, "grad_norm": 0.6311634350676925, "learning_rate": 5.189291198047994e-05, "loss": 0.5255, "step": 1319 }, { "epoch": 2.319859402460457, "grad_norm": 0.5524928630929371, "learning_rate": 5.184603552974496e-05, "loss": 0.5136, "step": 1320 }, { "epoch": 2.3216168717047454, "grad_norm": 0.34884272465417787, "learning_rate": 5.1799141239094554e-05, "loss": 0.5166, "step": 1321 }, { "epoch": 2.3233743409490333, "grad_norm": 0.32957985634055553, "learning_rate": 5.175222917915067e-05, "loss": 0.5137, "step": 1322 }, { "epoch": 2.3251318101933216, "grad_norm": 0.3399179702889197, "learning_rate": 5.170529942056203e-05, "loss": 0.526, "step": 1323 }, { "epoch": 2.32688927943761, "grad_norm": 0.3690809272470657, "learning_rate": 5.1658352034004e-05, "loss": 0.5195, "step": 1324 }, { "epoch": 2.3286467486818982, "grad_norm": 0.30448586611665973, "learning_rate": 5.16113870901785e-05, "loss": 0.5183, "step": 1325 }, { "epoch": 2.330404217926186, "grad_norm": 0.31074588471610765, "learning_rate": 5.1564404659813887e-05, "loss": 0.5149, "step": 1326 }, { "epoch": 2.3321616871704745, "grad_norm": 0.35357390104657027, "learning_rate": 5.151740481366485e-05, "loss": 0.5141, "step": 1327 }, { "epoch": 2.3339191564147628, "grad_norm": 0.31753323768767494, "learning_rate": 5.147038762251231e-05, "loss": 0.4984, "step": 1328 }, { "epoch": 2.335676625659051, "grad_norm": 0.37978214267929056, "learning_rate": 5.142335315716329e-05, "loss": 0.5115, "step": 1329 }, { "epoch": 2.3374340949033394, "grad_norm": 0.4261605003626959, "learning_rate": 5.1376301488450874e-05, "loss": 0.5307, "step": 1330 }, { "epoch": 2.3391915641476273, "grad_norm": 0.3415259246746834, "learning_rate": 5.132923268723402e-05, "loss": 0.5293, "step": 1331 }, { "epoch": 2.3409490333919156, "grad_norm": 0.25913649944411027, "learning_rate": 5.128214682439747e-05, "loss": 0.519, "step": 1332 }, { "epoch": 2.342706502636204, "grad_norm": 0.417314455324043, "learning_rate": 5.123504397085169e-05, "loss": 0.518, "step": 1333 }, { "epoch": 2.3444639718804923, "grad_norm": 0.44783627218807637, "learning_rate": 5.1187924197532736e-05, "loss": 0.5204, "step": 1334 }, { "epoch": 2.34622144112478, "grad_norm": 0.48282885528479524, "learning_rate": 5.114078757540213e-05, "loss": 0.5181, "step": 1335 }, { "epoch": 2.3479789103690685, "grad_norm": 0.44246238838745156, "learning_rate": 5.1093634175446776e-05, "loss": 0.5107, "step": 1336 }, { "epoch": 2.349736379613357, "grad_norm": 0.4028213599510058, "learning_rate": 5.1046464068678836e-05, "loss": 0.515, "step": 1337 }, { "epoch": 2.351493848857645, "grad_norm": 0.3156165847354602, "learning_rate": 5.099927732613564e-05, "loss": 0.5141, "step": 1338 }, { "epoch": 2.3532513181019334, "grad_norm": 0.255868687207953, "learning_rate": 5.095207401887958e-05, "loss": 0.5167, "step": 1339 }, { "epoch": 2.3550087873462213, "grad_norm": 0.23897059559825065, "learning_rate": 5.0904854217997965e-05, "loss": 0.5227, "step": 1340 }, { "epoch": 2.3567662565905096, "grad_norm": 0.3300453394346074, "learning_rate": 5.085761799460297e-05, "loss": 0.5151, "step": 1341 }, { "epoch": 2.358523725834798, "grad_norm": 0.4378646090471379, "learning_rate": 5.0810365419831516e-05, "loss": 0.5089, "step": 1342 }, { "epoch": 2.3602811950790863, "grad_norm": 0.5034914813953035, "learning_rate": 5.0763096564845084e-05, "loss": 0.5173, "step": 1343 }, { "epoch": 2.362038664323374, "grad_norm": 0.4670717948887586, "learning_rate": 5.071581150082974e-05, "loss": 0.5167, "step": 1344 }, { "epoch": 2.3637961335676625, "grad_norm": 0.38372564455765995, "learning_rate": 5.066851029899594e-05, "loss": 0.5094, "step": 1345 }, { "epoch": 2.365553602811951, "grad_norm": 0.40436965589753754, "learning_rate": 5.0621193030578435e-05, "loss": 0.5082, "step": 1346 }, { "epoch": 2.367311072056239, "grad_norm": 0.3853002874074954, "learning_rate": 5.057385976683617e-05, "loss": 0.5136, "step": 1347 }, { "epoch": 2.3690685413005275, "grad_norm": 0.3577837769412927, "learning_rate": 5.0526510579052186e-05, "loss": 0.517, "step": 1348 }, { "epoch": 2.3708260105448153, "grad_norm": 0.38460710353567357, "learning_rate": 5.04791455385335e-05, "loss": 0.515, "step": 1349 }, { "epoch": 2.3725834797891037, "grad_norm": 0.5073839983351341, "learning_rate": 5.0431764716611024e-05, "loss": 0.5096, "step": 1350 }, { "epoch": 2.374340949033392, "grad_norm": 0.520267971321212, "learning_rate": 5.038436818463942e-05, "loss": 0.5028, "step": 1351 }, { "epoch": 2.3760984182776803, "grad_norm": 0.4655189537061093, "learning_rate": 5.0336956013996964e-05, "loss": 0.5259, "step": 1352 }, { "epoch": 2.377855887521968, "grad_norm": 0.4800430766389486, "learning_rate": 5.02895282760856e-05, "loss": 0.5095, "step": 1353 }, { "epoch": 2.3796133567662565, "grad_norm": 0.4184749344146131, "learning_rate": 5.024208504233057e-05, "loss": 0.5111, "step": 1354 }, { "epoch": 2.381370826010545, "grad_norm": 0.3373124430151287, "learning_rate": 5.019462638418059e-05, "loss": 0.5228, "step": 1355 }, { "epoch": 2.383128295254833, "grad_norm": 0.27653120404518255, "learning_rate": 5.0147152373107505e-05, "loss": 0.5064, "step": 1356 }, { "epoch": 2.3848857644991215, "grad_norm": 0.3553013387252305, "learning_rate": 5.009966308060632e-05, "loss": 0.5194, "step": 1357 }, { "epoch": 2.3866432337434094, "grad_norm": 0.4146653356017265, "learning_rate": 5.0052158578195064e-05, "loss": 0.5236, "step": 1358 }, { "epoch": 2.3884007029876977, "grad_norm": 0.4074972845251887, "learning_rate": 5.000463893741466e-05, "loss": 0.5213, "step": 1359 }, { "epoch": 2.390158172231986, "grad_norm": 0.3571218883482412, "learning_rate": 4.995710422982882e-05, "loss": 0.5146, "step": 1360 }, { "epoch": 2.3919156414762743, "grad_norm": 0.2960605172062756, "learning_rate": 4.990955452702395e-05, "loss": 0.5122, "step": 1361 }, { "epoch": 2.393673110720562, "grad_norm": 0.46806360759827, "learning_rate": 4.9861989900609036e-05, "loss": 0.5244, "step": 1362 }, { "epoch": 2.3954305799648505, "grad_norm": 0.5301011582208461, "learning_rate": 4.9814410422215576e-05, "loss": 0.5181, "step": 1363 }, { "epoch": 2.397188049209139, "grad_norm": 0.5201576602921211, "learning_rate": 4.9766816163497386e-05, "loss": 0.5164, "step": 1364 }, { "epoch": 2.398945518453427, "grad_norm": 0.4263187022455153, "learning_rate": 4.971920719613056e-05, "loss": 0.509, "step": 1365 }, { "epoch": 2.4007029876977155, "grad_norm": 0.4178181352318771, "learning_rate": 4.9671583591813356e-05, "loss": 0.5203, "step": 1366 }, { "epoch": 2.4024604569420034, "grad_norm": 0.3611883756015059, "learning_rate": 4.9623945422266023e-05, "loss": 0.5194, "step": 1367 }, { "epoch": 2.4042179261862917, "grad_norm": 0.26718990425256584, "learning_rate": 4.957629275923082e-05, "loss": 0.5184, "step": 1368 }, { "epoch": 2.40597539543058, "grad_norm": 0.4443804017883953, "learning_rate": 4.952862567447179e-05, "loss": 0.519, "step": 1369 }, { "epoch": 2.4077328646748684, "grad_norm": 0.45844279039097574, "learning_rate": 4.9480944239774695e-05, "loss": 0.5178, "step": 1370 }, { "epoch": 2.4094903339191562, "grad_norm": 0.5754908082981449, "learning_rate": 4.943324852694692e-05, "loss": 0.5207, "step": 1371 }, { "epoch": 2.4112478031634446, "grad_norm": 0.6341460020632249, "learning_rate": 4.938553860781733e-05, "loss": 0.5179, "step": 1372 }, { "epoch": 2.413005272407733, "grad_norm": 0.5785568582390078, "learning_rate": 4.933781455423623e-05, "loss": 0.5141, "step": 1373 }, { "epoch": 2.414762741652021, "grad_norm": 0.5146887243351527, "learning_rate": 4.929007643807514e-05, "loss": 0.5191, "step": 1374 }, { "epoch": 2.4165202108963095, "grad_norm": 0.373834723057991, "learning_rate": 4.9242324331226854e-05, "loss": 0.5259, "step": 1375 }, { "epoch": 2.4182776801405974, "grad_norm": 0.3256166958703551, "learning_rate": 4.919455830560514e-05, "loss": 0.5152, "step": 1376 }, { "epoch": 2.4200351493848857, "grad_norm": 0.311540862091649, "learning_rate": 4.914677843314479e-05, "loss": 0.5151, "step": 1377 }, { "epoch": 2.421792618629174, "grad_norm": 0.34567736428567347, "learning_rate": 4.909898478580141e-05, "loss": 0.5165, "step": 1378 }, { "epoch": 2.4235500878734624, "grad_norm": 0.43915163515214717, "learning_rate": 4.905117743555139e-05, "loss": 0.5237, "step": 1379 }, { "epoch": 2.4253075571177503, "grad_norm": 0.4988896217711804, "learning_rate": 4.900335645439171e-05, "loss": 0.5248, "step": 1380 }, { "epoch": 2.4270650263620386, "grad_norm": 0.5360484561546148, "learning_rate": 4.895552191433995e-05, "loss": 0.512, "step": 1381 }, { "epoch": 2.428822495606327, "grad_norm": 0.5065537430203322, "learning_rate": 4.8907673887434005e-05, "loss": 0.5209, "step": 1382 }, { "epoch": 2.4305799648506152, "grad_norm": 0.42375505321757834, "learning_rate": 4.8859812445732185e-05, "loss": 0.5151, "step": 1383 }, { "epoch": 2.4323374340949035, "grad_norm": 0.30315492082885515, "learning_rate": 4.881193766131292e-05, "loss": 0.5243, "step": 1384 }, { "epoch": 2.4340949033391914, "grad_norm": 0.3699441806011046, "learning_rate": 4.876404960627479e-05, "loss": 0.5333, "step": 1385 }, { "epoch": 2.4358523725834798, "grad_norm": 0.529629380111872, "learning_rate": 4.8716148352736345e-05, "loss": 0.5181, "step": 1386 }, { "epoch": 2.437609841827768, "grad_norm": 0.5896064092679025, "learning_rate": 4.8668233972836e-05, "loss": 0.5243, "step": 1387 }, { "epoch": 2.4393673110720564, "grad_norm": 0.388710298922272, "learning_rate": 4.862030653873195e-05, "loss": 0.5258, "step": 1388 }, { "epoch": 2.4411247803163443, "grad_norm": 0.2830186411098548, "learning_rate": 4.857236612260203e-05, "loss": 0.525, "step": 1389 }, { "epoch": 2.4428822495606326, "grad_norm": 0.3177258712513255, "learning_rate": 4.8524412796643654e-05, "loss": 0.5155, "step": 1390 }, { "epoch": 2.444639718804921, "grad_norm": 0.37713910447535737, "learning_rate": 4.8476446633073666e-05, "loss": 0.521, "step": 1391 }, { "epoch": 2.4463971880492092, "grad_norm": 0.36249165280254936, "learning_rate": 4.842846770412823e-05, "loss": 0.5189, "step": 1392 }, { "epoch": 2.4481546572934976, "grad_norm": 0.3610464515401807, "learning_rate": 4.8380476082062764e-05, "loss": 0.5179, "step": 1393 }, { "epoch": 2.4499121265377855, "grad_norm": 0.31955271138835756, "learning_rate": 4.833247183915175e-05, "loss": 0.519, "step": 1394 }, { "epoch": 2.4516695957820738, "grad_norm": 0.2953949007015286, "learning_rate": 4.8284455047688746e-05, "loss": 0.5155, "step": 1395 }, { "epoch": 2.453427065026362, "grad_norm": 0.35134660197577255, "learning_rate": 4.823642577998616e-05, "loss": 0.5211, "step": 1396 }, { "epoch": 2.4551845342706504, "grad_norm": 0.34467608096891095, "learning_rate": 4.818838410837519e-05, "loss": 0.5284, "step": 1397 }, { "epoch": 2.4569420035149383, "grad_norm": 0.35605226480760666, "learning_rate": 4.814033010520575e-05, "loss": 0.5164, "step": 1398 }, { "epoch": 2.4586994727592266, "grad_norm": 0.38046831439507317, "learning_rate": 4.809226384284626e-05, "loss": 0.5285, "step": 1399 }, { "epoch": 2.460456942003515, "grad_norm": 0.42142422002951035, "learning_rate": 4.8044185393683684e-05, "loss": 0.5114, "step": 1400 }, { "epoch": 2.4622144112478033, "grad_norm": 0.4558581049290006, "learning_rate": 4.799609483012328e-05, "loss": 0.5222, "step": 1401 }, { "epoch": 2.4639718804920916, "grad_norm": 0.35565485984912776, "learning_rate": 4.794799222458856e-05, "loss": 0.5162, "step": 1402 }, { "epoch": 2.4657293497363795, "grad_norm": 0.41626186271377946, "learning_rate": 4.78998776495212e-05, "loss": 0.5243, "step": 1403 }, { "epoch": 2.467486818980668, "grad_norm": 0.4868094372791983, "learning_rate": 4.785175117738085e-05, "loss": 0.5325, "step": 1404 }, { "epoch": 2.469244288224956, "grad_norm": 0.46476901982790286, "learning_rate": 4.780361288064514e-05, "loss": 0.516, "step": 1405 }, { "epoch": 2.4710017574692444, "grad_norm": 0.4159081827486273, "learning_rate": 4.775546283180945e-05, "loss": 0.5268, "step": 1406 }, { "epoch": 2.4727592267135323, "grad_norm": 0.24415400126448175, "learning_rate": 4.770730110338689e-05, "loss": 0.5284, "step": 1407 }, { "epoch": 2.4745166959578206, "grad_norm": 0.30706734096645216, "learning_rate": 4.765912776790818e-05, "loss": 0.5098, "step": 1408 }, { "epoch": 2.476274165202109, "grad_norm": 0.4146670596748604, "learning_rate": 4.761094289792144e-05, "loss": 0.5133, "step": 1409 }, { "epoch": 2.4780316344463973, "grad_norm": 0.4977576748480732, "learning_rate": 4.756274656599226e-05, "loss": 0.5206, "step": 1410 }, { "epoch": 2.4797891036906856, "grad_norm": 0.5543696268629039, "learning_rate": 4.7514538844703415e-05, "loss": 0.5238, "step": 1411 }, { "epoch": 2.4815465729349735, "grad_norm": 0.48237924389111375, "learning_rate": 4.7466319806654875e-05, "loss": 0.5155, "step": 1412 }, { "epoch": 2.483304042179262, "grad_norm": 0.36048568755023136, "learning_rate": 4.741808952446364e-05, "loss": 0.5176, "step": 1413 }, { "epoch": 2.48506151142355, "grad_norm": 0.2762156017568894, "learning_rate": 4.7369848070763646e-05, "loss": 0.5113, "step": 1414 }, { "epoch": 2.4868189806678385, "grad_norm": 0.3838654091970044, "learning_rate": 4.732159551820565e-05, "loss": 0.5207, "step": 1415 }, { "epoch": 2.4885764499121263, "grad_norm": 0.41450918568796447, "learning_rate": 4.7273331939457115e-05, "loss": 0.5252, "step": 1416 }, { "epoch": 2.4903339191564147, "grad_norm": 0.3956071083535752, "learning_rate": 4.722505740720211e-05, "loss": 0.5094, "step": 1417 }, { "epoch": 2.492091388400703, "grad_norm": 0.29567965183891926, "learning_rate": 4.717677199414124e-05, "loss": 0.513, "step": 1418 }, { "epoch": 2.4938488576449913, "grad_norm": 0.36176577945353006, "learning_rate": 4.712847577299144e-05, "loss": 0.5098, "step": 1419 }, { "epoch": 2.4956063268892796, "grad_norm": 0.39061646805328115, "learning_rate": 4.708016881648596e-05, "loss": 0.5106, "step": 1420 }, { "epoch": 2.4973637961335675, "grad_norm": 0.3221471269477848, "learning_rate": 4.703185119737419e-05, "loss": 0.5193, "step": 1421 }, { "epoch": 2.499121265377856, "grad_norm": 0.32033483024020637, "learning_rate": 4.698352298842159e-05, "loss": 0.5137, "step": 1422 }, { "epoch": 2.500878734622144, "grad_norm": 0.3119795332203357, "learning_rate": 4.693518426240959e-05, "loss": 0.5167, "step": 1423 }, { "epoch": 2.5026362038664325, "grad_norm": 0.3385574492356202, "learning_rate": 4.688683509213542e-05, "loss": 0.5144, "step": 1424 }, { "epoch": 2.5043936731107204, "grad_norm": 0.35604490324557747, "learning_rate": 4.683847555041205e-05, "loss": 0.5178, "step": 1425 }, { "epoch": 2.5061511423550087, "grad_norm": 0.3528614530852968, "learning_rate": 4.679010571006811e-05, "loss": 0.5185, "step": 1426 }, { "epoch": 2.507908611599297, "grad_norm": 0.2705051423156002, "learning_rate": 4.674172564394765e-05, "loss": 0.5096, "step": 1427 }, { "epoch": 2.5096660808435853, "grad_norm": 0.3103003595757415, "learning_rate": 4.669333542491024e-05, "loss": 0.526, "step": 1428 }, { "epoch": 2.5114235500878737, "grad_norm": 0.34490488596605207, "learning_rate": 4.664493512583062e-05, "loss": 0.5257, "step": 1429 }, { "epoch": 2.5131810193321615, "grad_norm": 0.3823280410355408, "learning_rate": 4.659652481959881e-05, "loss": 0.5074, "step": 1430 }, { "epoch": 2.51493848857645, "grad_norm": 0.3470614622028184, "learning_rate": 4.654810457911982e-05, "loss": 0.5208, "step": 1431 }, { "epoch": 2.516695957820738, "grad_norm": 0.34079089738432505, "learning_rate": 4.649967447731367e-05, "loss": 0.5172, "step": 1432 }, { "epoch": 2.5184534270650265, "grad_norm": 0.33995514674870236, "learning_rate": 4.645123458711522e-05, "loss": 0.516, "step": 1433 }, { "epoch": 2.5202108963093144, "grad_norm": 0.3298515474621665, "learning_rate": 4.6402784981474076e-05, "loss": 0.5263, "step": 1434 }, { "epoch": 2.5219683655536027, "grad_norm": 0.2988023477910157, "learning_rate": 4.635432573335446e-05, "loss": 0.5167, "step": 1435 }, { "epoch": 2.523725834797891, "grad_norm": 0.2597671883218545, "learning_rate": 4.630585691573513e-05, "loss": 0.5194, "step": 1436 }, { "epoch": 2.5254833040421794, "grad_norm": 0.3511030056831073, "learning_rate": 4.625737860160924e-05, "loss": 0.52, "step": 1437 }, { "epoch": 2.5272407732864677, "grad_norm": 0.3326700259791027, "learning_rate": 4.620889086398427e-05, "loss": 0.511, "step": 1438 }, { "epoch": 2.5289982425307556, "grad_norm": 0.31992964884387154, "learning_rate": 4.616039377588185e-05, "loss": 0.514, "step": 1439 }, { "epoch": 2.530755711775044, "grad_norm": 0.2993669571472975, "learning_rate": 4.611188741033774e-05, "loss": 0.5194, "step": 1440 }, { "epoch": 2.532513181019332, "grad_norm": 0.27990083804746985, "learning_rate": 4.6063371840401646e-05, "loss": 0.5125, "step": 1441 }, { "epoch": 2.5342706502636205, "grad_norm": 0.35191598743967006, "learning_rate": 4.601484713913714e-05, "loss": 0.522, "step": 1442 }, { "epoch": 2.5360281195079084, "grad_norm": 0.3454536577050063, "learning_rate": 4.596631337962155e-05, "loss": 0.5244, "step": 1443 }, { "epoch": 2.5377855887521967, "grad_norm": 0.3706680227474578, "learning_rate": 4.591777063494582e-05, "loss": 0.5186, "step": 1444 }, { "epoch": 2.539543057996485, "grad_norm": 0.3268078286541158, "learning_rate": 4.586921897821447e-05, "loss": 0.5278, "step": 1445 }, { "epoch": 2.5413005272407734, "grad_norm": 0.2737631553952414, "learning_rate": 4.5820658482545405e-05, "loss": 0.509, "step": 1446 }, { "epoch": 2.5430579964850617, "grad_norm": 0.30377521030742116, "learning_rate": 4.577208922106987e-05, "loss": 0.5073, "step": 1447 }, { "epoch": 2.5448154657293496, "grad_norm": 0.35024548184618504, "learning_rate": 4.572351126693227e-05, "loss": 0.5201, "step": 1448 }, { "epoch": 2.546572934973638, "grad_norm": 0.35259419403679937, "learning_rate": 4.567492469329013e-05, "loss": 0.5309, "step": 1449 }, { "epoch": 2.5483304042179262, "grad_norm": 0.32526401926242743, "learning_rate": 4.562632957331397e-05, "loss": 0.5179, "step": 1450 }, { "epoch": 2.5500878734622145, "grad_norm": 0.34552230404739925, "learning_rate": 4.5577725980187154e-05, "loss": 0.5267, "step": 1451 }, { "epoch": 2.5518453427065024, "grad_norm": 0.4193508275458657, "learning_rate": 4.552911398710581e-05, "loss": 0.5317, "step": 1452 }, { "epoch": 2.5536028119507908, "grad_norm": 0.5242425818102595, "learning_rate": 4.548049366727873e-05, "loss": 0.5207, "step": 1453 }, { "epoch": 2.555360281195079, "grad_norm": 0.37895193134662436, "learning_rate": 4.5431865093927225e-05, "loss": 0.5131, "step": 1454 }, { "epoch": 2.5571177504393674, "grad_norm": 0.26772098357922736, "learning_rate": 4.538322834028505e-05, "loss": 0.5092, "step": 1455 }, { "epoch": 2.5588752196836557, "grad_norm": 0.41435690549784826, "learning_rate": 4.5334583479598286e-05, "loss": 0.5294, "step": 1456 }, { "epoch": 2.5606326889279436, "grad_norm": 0.37907990791679275, "learning_rate": 4.5285930585125205e-05, "loss": 0.5129, "step": 1457 }, { "epoch": 2.562390158172232, "grad_norm": 0.31616714329870205, "learning_rate": 4.523726973013621e-05, "loss": 0.5084, "step": 1458 }, { "epoch": 2.5641476274165202, "grad_norm": 0.4337800311039469, "learning_rate": 4.5188600987913625e-05, "loss": 0.5149, "step": 1459 }, { "epoch": 2.5659050966608086, "grad_norm": 0.4329280635608506, "learning_rate": 4.513992443175173e-05, "loss": 0.5143, "step": 1460 }, { "epoch": 2.5676625659050965, "grad_norm": 0.374849312769997, "learning_rate": 4.5091240134956535e-05, "loss": 0.5231, "step": 1461 }, { "epoch": 2.5694200351493848, "grad_norm": 0.2541960880403694, "learning_rate": 4.50425481708457e-05, "loss": 0.5159, "step": 1462 }, { "epoch": 2.571177504393673, "grad_norm": 0.2777182689177767, "learning_rate": 4.4993848612748446e-05, "loss": 0.5066, "step": 1463 }, { "epoch": 2.5729349736379614, "grad_norm": 0.33116028042510365, "learning_rate": 4.494514153400545e-05, "loss": 0.5194, "step": 1464 }, { "epoch": 2.5746924428822497, "grad_norm": 0.3441541280714304, "learning_rate": 4.4896427007968655e-05, "loss": 0.5148, "step": 1465 }, { "epoch": 2.5764499121265376, "grad_norm": 0.40990835820198723, "learning_rate": 4.484770510800128e-05, "loss": 0.5228, "step": 1466 }, { "epoch": 2.578207381370826, "grad_norm": 0.40840957748787277, "learning_rate": 4.479897590747761e-05, "loss": 0.5228, "step": 1467 }, { "epoch": 2.5799648506151143, "grad_norm": 0.2972577278217296, "learning_rate": 4.4750239479782965e-05, "loss": 0.5218, "step": 1468 }, { "epoch": 2.5817223198594026, "grad_norm": 0.3090239144596006, "learning_rate": 4.470149589831351e-05, "loss": 0.5187, "step": 1469 }, { "epoch": 2.5834797891036905, "grad_norm": 0.2795664272728976, "learning_rate": 4.46527452364762e-05, "loss": 0.5218, "step": 1470 }, { "epoch": 2.585237258347979, "grad_norm": 0.3989270446469345, "learning_rate": 4.460398756768864e-05, "loss": 0.5196, "step": 1471 }, { "epoch": 2.586994727592267, "grad_norm": 0.39762237205877227, "learning_rate": 4.4555222965379005e-05, "loss": 0.5268, "step": 1472 }, { "epoch": 2.5887521968365554, "grad_norm": 0.29362942486407134, "learning_rate": 4.4506451502985915e-05, "loss": 0.5135, "step": 1473 }, { "epoch": 2.5905096660808438, "grad_norm": 0.4379266454546761, "learning_rate": 4.4457673253958305e-05, "loss": 0.5147, "step": 1474 }, { "epoch": 2.5922671353251316, "grad_norm": 0.4079714863549631, "learning_rate": 4.440888829175533e-05, "loss": 0.5116, "step": 1475 }, { "epoch": 2.59402460456942, "grad_norm": 0.25492292028289804, "learning_rate": 4.436009668984626e-05, "loss": 0.5095, "step": 1476 }, { "epoch": 2.5957820738137083, "grad_norm": 0.3307417826703091, "learning_rate": 4.431129852171037e-05, "loss": 0.5137, "step": 1477 }, { "epoch": 2.5975395430579966, "grad_norm": 0.3635250002522398, "learning_rate": 4.426249386083684e-05, "loss": 0.522, "step": 1478 }, { "epoch": 2.5992970123022845, "grad_norm": 0.27854178763536747, "learning_rate": 4.421368278072456e-05, "loss": 0.5208, "step": 1479 }, { "epoch": 2.601054481546573, "grad_norm": 0.3199040882511732, "learning_rate": 4.416486535488219e-05, "loss": 0.5226, "step": 1480 }, { "epoch": 2.602811950790861, "grad_norm": 0.35294798996002, "learning_rate": 4.411604165682786e-05, "loss": 0.5046, "step": 1481 }, { "epoch": 2.6045694200351495, "grad_norm": 0.3243300660188701, "learning_rate": 4.406721176008916e-05, "loss": 0.5012, "step": 1482 }, { "epoch": 2.606326889279438, "grad_norm": 0.26223834185875206, "learning_rate": 4.401837573820307e-05, "loss": 0.5239, "step": 1483 }, { "epoch": 2.608084358523726, "grad_norm": 0.3157756857754041, "learning_rate": 4.396953366471572e-05, "loss": 0.5079, "step": 1484 }, { "epoch": 2.609841827768014, "grad_norm": 0.3130620641570958, "learning_rate": 4.392068561318244e-05, "loss": 0.5281, "step": 1485 }, { "epoch": 2.6115992970123023, "grad_norm": 0.3217136470754769, "learning_rate": 4.3871831657167456e-05, "loss": 0.5156, "step": 1486 }, { "epoch": 2.6133567662565906, "grad_norm": 0.40287631463203416, "learning_rate": 4.382297187024398e-05, "loss": 0.521, "step": 1487 }, { "epoch": 2.6151142355008785, "grad_norm": 0.31871552859841634, "learning_rate": 4.3774106325993946e-05, "loss": 0.5148, "step": 1488 }, { "epoch": 2.616871704745167, "grad_norm": 0.29155578034473034, "learning_rate": 4.3725235098007995e-05, "loss": 0.5178, "step": 1489 }, { "epoch": 2.618629173989455, "grad_norm": 0.27111406626762363, "learning_rate": 4.367635825988531e-05, "loss": 0.5172, "step": 1490 }, { "epoch": 2.6203866432337435, "grad_norm": 0.2510304312798982, "learning_rate": 4.362747588523354e-05, "loss": 0.5246, "step": 1491 }, { "epoch": 2.622144112478032, "grad_norm": 0.2706767691964339, "learning_rate": 4.3578588047668626e-05, "loss": 0.5101, "step": 1492 }, { "epoch": 2.62390158172232, "grad_norm": 0.24050819531025944, "learning_rate": 4.352969482081479e-05, "loss": 0.5048, "step": 1493 }, { "epoch": 2.625659050966608, "grad_norm": 0.24944768415767293, "learning_rate": 4.348079627830434e-05, "loss": 0.5117, "step": 1494 }, { "epoch": 2.6274165202108963, "grad_norm": 0.2356874275098619, "learning_rate": 4.3431892493777604e-05, "loss": 0.5225, "step": 1495 }, { "epoch": 2.6291739894551847, "grad_norm": 0.23926097045677566, "learning_rate": 4.338298354088279e-05, "loss": 0.5113, "step": 1496 }, { "epoch": 2.6309314586994725, "grad_norm": 0.2595761609286993, "learning_rate": 4.33340694932759e-05, "loss": 0.513, "step": 1497 }, { "epoch": 2.632688927943761, "grad_norm": 0.22920158296515228, "learning_rate": 4.328515042462062e-05, "loss": 0.5183, "step": 1498 }, { "epoch": 2.634446397188049, "grad_norm": 0.23854350683746975, "learning_rate": 4.323622640858816e-05, "loss": 0.5332, "step": 1499 }, { "epoch": 2.6362038664323375, "grad_norm": 0.277436964750548, "learning_rate": 4.3187297518857205e-05, "loss": 0.5176, "step": 1500 }, { "epoch": 2.637961335676626, "grad_norm": 0.2299596741647995, "learning_rate": 4.313836382911381e-05, "loss": 0.504, "step": 1501 }, { "epoch": 2.639718804920914, "grad_norm": 0.26155099494034323, "learning_rate": 4.308942541305119e-05, "loss": 0.519, "step": 1502 }, { "epoch": 2.641476274165202, "grad_norm": 0.2523908325037982, "learning_rate": 4.304048234436973e-05, "loss": 0.5195, "step": 1503 }, { "epoch": 2.6432337434094904, "grad_norm": 0.29348781641071964, "learning_rate": 4.299153469677681e-05, "loss": 0.523, "step": 1504 }, { "epoch": 2.6449912126537787, "grad_norm": 0.2709160632339866, "learning_rate": 4.2942582543986705e-05, "loss": 0.5195, "step": 1505 }, { "epoch": 2.6467486818980666, "grad_norm": 0.288523312347925, "learning_rate": 4.289362595972046e-05, "loss": 0.5319, "step": 1506 }, { "epoch": 2.648506151142355, "grad_norm": 0.30101648786103313, "learning_rate": 4.284466501770582e-05, "loss": 0.5295, "step": 1507 }, { "epoch": 2.650263620386643, "grad_norm": 0.3485425734704446, "learning_rate": 4.279569979167708e-05, "loss": 0.5177, "step": 1508 }, { "epoch": 2.6520210896309315, "grad_norm": 0.41241655650041714, "learning_rate": 4.274673035537495e-05, "loss": 0.5075, "step": 1509 }, { "epoch": 2.65377855887522, "grad_norm": 0.3331095730849406, "learning_rate": 4.269775678254656e-05, "loss": 0.5137, "step": 1510 }, { "epoch": 2.655536028119508, "grad_norm": 0.25149267113486473, "learning_rate": 4.264877914694523e-05, "loss": 0.5229, "step": 1511 }, { "epoch": 2.657293497363796, "grad_norm": 0.2865938905443983, "learning_rate": 4.259979752233034e-05, "loss": 0.5107, "step": 1512 }, { "epoch": 2.6590509666080844, "grad_norm": 0.33798382225835, "learning_rate": 4.255081198246739e-05, "loss": 0.5072, "step": 1513 }, { "epoch": 2.6608084358523727, "grad_norm": 0.4100437644303138, "learning_rate": 4.2501822601127675e-05, "loss": 0.5269, "step": 1514 }, { "epoch": 2.6625659050966606, "grad_norm": 0.3766626221635528, "learning_rate": 4.2452829452088346e-05, "loss": 0.5183, "step": 1515 }, { "epoch": 2.664323374340949, "grad_norm": 0.29306206940442636, "learning_rate": 4.24038326091322e-05, "loss": 0.5335, "step": 1516 }, { "epoch": 2.6660808435852372, "grad_norm": 0.2719068291260996, "learning_rate": 4.235483214604756e-05, "loss": 0.5098, "step": 1517 }, { "epoch": 2.6678383128295255, "grad_norm": 0.35923753873096137, "learning_rate": 4.230582813662828e-05, "loss": 0.5217, "step": 1518 }, { "epoch": 2.669595782073814, "grad_norm": 0.3698471445648079, "learning_rate": 4.2256820654673506e-05, "loss": 0.5184, "step": 1519 }, { "epoch": 2.671353251318102, "grad_norm": 0.35002453063075956, "learning_rate": 4.220780977398761e-05, "loss": 0.5262, "step": 1520 }, { "epoch": 2.67311072056239, "grad_norm": 0.38589916440456895, "learning_rate": 4.215879556838008e-05, "loss": 0.5145, "step": 1521 }, { "epoch": 2.6748681898066784, "grad_norm": 0.3704989763357141, "learning_rate": 4.2109778111665445e-05, "loss": 0.5223, "step": 1522 }, { "epoch": 2.6766256590509667, "grad_norm": 0.2873408984769108, "learning_rate": 4.20607574776631e-05, "loss": 0.5091, "step": 1523 }, { "epoch": 2.6783831282952546, "grad_norm": 0.3695174743818838, "learning_rate": 4.2011733740197244e-05, "loss": 0.5215, "step": 1524 }, { "epoch": 2.680140597539543, "grad_norm": 0.45572167691666915, "learning_rate": 4.1962706973096726e-05, "loss": 0.5178, "step": 1525 }, { "epoch": 2.6818980667838312, "grad_norm": 0.3387773608865029, "learning_rate": 4.191367725019499e-05, "loss": 0.5193, "step": 1526 }, { "epoch": 2.6836555360281196, "grad_norm": 0.317736176226776, "learning_rate": 4.186464464532987e-05, "loss": 0.5099, "step": 1527 }, { "epoch": 2.685413005272408, "grad_norm": 0.43042464477374687, "learning_rate": 4.1815609232343636e-05, "loss": 0.5332, "step": 1528 }, { "epoch": 2.687170474516696, "grad_norm": 0.36220452272992176, "learning_rate": 4.17665710850827e-05, "loss": 0.4958, "step": 1529 }, { "epoch": 2.688927943760984, "grad_norm": 0.32164132885133734, "learning_rate": 4.171753027739764e-05, "loss": 0.5194, "step": 1530 }, { "epoch": 2.6906854130052724, "grad_norm": 0.32310466077931455, "learning_rate": 4.166848688314303e-05, "loss": 0.502, "step": 1531 }, { "epoch": 2.6924428822495607, "grad_norm": 0.2765814927724835, "learning_rate": 4.16194409761773e-05, "loss": 0.5123, "step": 1532 }, { "epoch": 2.6942003514938486, "grad_norm": 0.2858266619428896, "learning_rate": 4.157039263036275e-05, "loss": 0.5119, "step": 1533 }, { "epoch": 2.695957820738137, "grad_norm": 0.3449038051645841, "learning_rate": 4.152134191956527e-05, "loss": 0.5201, "step": 1534 }, { "epoch": 2.6977152899824253, "grad_norm": 0.39741707403153664, "learning_rate": 4.147228891765436e-05, "loss": 0.5082, "step": 1535 }, { "epoch": 2.6994727592267136, "grad_norm": 0.3687931202369898, "learning_rate": 4.142323369850295e-05, "loss": 0.5271, "step": 1536 }, { "epoch": 2.701230228471002, "grad_norm": 0.28001479876593843, "learning_rate": 4.1374176335987296e-05, "loss": 0.5185, "step": 1537 }, { "epoch": 2.7029876977152902, "grad_norm": 0.26360903165843486, "learning_rate": 4.1325116903986927e-05, "loss": 0.5295, "step": 1538 }, { "epoch": 2.704745166959578, "grad_norm": 0.3254579518009382, "learning_rate": 4.1276055476384434e-05, "loss": 0.5142, "step": 1539 }, { "epoch": 2.7065026362038664, "grad_norm": 0.36003923311145614, "learning_rate": 4.122699212706547e-05, "loss": 0.5137, "step": 1540 }, { "epoch": 2.7082601054481548, "grad_norm": 0.37819443149190535, "learning_rate": 4.117792692991854e-05, "loss": 0.5096, "step": 1541 }, { "epoch": 2.7100175746924426, "grad_norm": 0.3207353855875648, "learning_rate": 4.1128859958834926e-05, "loss": 0.5082, "step": 1542 }, { "epoch": 2.711775043936731, "grad_norm": 0.2611493834483945, "learning_rate": 4.107979128770862e-05, "loss": 0.517, "step": 1543 }, { "epoch": 2.7135325131810193, "grad_norm": 0.27786582860985193, "learning_rate": 4.103072099043615e-05, "loss": 0.5235, "step": 1544 }, { "epoch": 2.7152899824253076, "grad_norm": 0.32804979237743076, "learning_rate": 4.0981649140916495e-05, "loss": 0.506, "step": 1545 }, { "epoch": 2.717047451669596, "grad_norm": 0.3665000600847819, "learning_rate": 4.0932575813050975e-05, "loss": 0.5028, "step": 1546 }, { "epoch": 2.7188049209138843, "grad_norm": 0.28238436355006324, "learning_rate": 4.0883501080743136e-05, "loss": 0.5227, "step": 1547 }, { "epoch": 2.720562390158172, "grad_norm": 0.27635549937476306, "learning_rate": 4.083442501789864e-05, "loss": 0.5182, "step": 1548 }, { "epoch": 2.7223198594024605, "grad_norm": 0.30392852596774955, "learning_rate": 4.078534769842513e-05, "loss": 0.5165, "step": 1549 }, { "epoch": 2.724077328646749, "grad_norm": 0.3243431383264452, "learning_rate": 4.07362691962322e-05, "loss": 0.5241, "step": 1550 }, { "epoch": 2.7258347978910367, "grad_norm": 0.32356255456122573, "learning_rate": 4.068718958523116e-05, "loss": 0.5043, "step": 1551 }, { "epoch": 2.727592267135325, "grad_norm": 0.3080804449820484, "learning_rate": 4.063810893933502e-05, "loss": 0.5143, "step": 1552 }, { "epoch": 2.7293497363796133, "grad_norm": 0.2775964081180213, "learning_rate": 4.0589027332458346e-05, "loss": 0.5174, "step": 1553 }, { "epoch": 2.7311072056239016, "grad_norm": 0.29158954300724654, "learning_rate": 4.053994483851716e-05, "loss": 0.5204, "step": 1554 }, { "epoch": 2.73286467486819, "grad_norm": 0.34703081567283967, "learning_rate": 4.04908615314288e-05, "loss": 0.5178, "step": 1555 }, { "epoch": 2.7346221441124783, "grad_norm": 0.4120483464596026, "learning_rate": 4.0441777485111856e-05, "loss": 0.5084, "step": 1556 }, { "epoch": 2.736379613356766, "grad_norm": 0.3311075873510682, "learning_rate": 4.039269277348599e-05, "loss": 0.5228, "step": 1557 }, { "epoch": 2.7381370826010545, "grad_norm": 0.29758263633592064, "learning_rate": 4.034360747047191e-05, "loss": 0.5213, "step": 1558 }, { "epoch": 2.739894551845343, "grad_norm": 0.2815518293546168, "learning_rate": 4.0294521649991186e-05, "loss": 0.5139, "step": 1559 }, { "epoch": 2.7416520210896307, "grad_norm": 0.2980315228888719, "learning_rate": 4.024543538596619e-05, "loss": 0.5119, "step": 1560 }, { "epoch": 2.743409490333919, "grad_norm": 0.355882195371422, "learning_rate": 4.019634875231993e-05, "loss": 0.512, "step": 1561 }, { "epoch": 2.7451669595782073, "grad_norm": 0.3151245339074043, "learning_rate": 4.014726182297599e-05, "loss": 0.5174, "step": 1562 }, { "epoch": 2.7469244288224957, "grad_norm": 0.27413547521543996, "learning_rate": 4.009817467185842e-05, "loss": 0.5198, "step": 1563 }, { "epoch": 2.748681898066784, "grad_norm": 0.23055513305110661, "learning_rate": 4.004908737289156e-05, "loss": 0.529, "step": 1564 }, { "epoch": 2.7504393673110723, "grad_norm": 0.33694115012009873, "learning_rate": 4e-05, "loss": 0.5102, "step": 1565 }, { "epoch": 2.75219683655536, "grad_norm": 0.38867661707361906, "learning_rate": 3.9950912627108456e-05, "loss": 0.5293, "step": 1566 }, { "epoch": 2.7539543057996485, "grad_norm": 0.3033716579152999, "learning_rate": 3.9901825328141594e-05, "loss": 0.5241, "step": 1567 }, { "epoch": 2.755711775043937, "grad_norm": 0.30559988081897366, "learning_rate": 3.9852738177024014e-05, "loss": 0.5167, "step": 1568 }, { "epoch": 2.7574692442882247, "grad_norm": 0.2855696595802452, "learning_rate": 3.980365124768008e-05, "loss": 0.5213, "step": 1569 }, { "epoch": 2.759226713532513, "grad_norm": 0.2879287936027693, "learning_rate": 3.975456461403383e-05, "loss": 0.5129, "step": 1570 }, { "epoch": 2.7609841827768014, "grad_norm": 0.34783531675659246, "learning_rate": 3.970547835000882e-05, "loss": 0.5233, "step": 1571 }, { "epoch": 2.7627416520210897, "grad_norm": 0.33888947836883715, "learning_rate": 3.96563925295281e-05, "loss": 0.5254, "step": 1572 }, { "epoch": 2.764499121265378, "grad_norm": 0.20895805913094265, "learning_rate": 3.960730722651402e-05, "loss": 0.502, "step": 1573 }, { "epoch": 2.7662565905096663, "grad_norm": 0.34972199582333147, "learning_rate": 3.955822251488816e-05, "loss": 0.5084, "step": 1574 }, { "epoch": 2.768014059753954, "grad_norm": 0.26891375055839345, "learning_rate": 3.950913846857121e-05, "loss": 0.5202, "step": 1575 }, { "epoch": 2.7697715289982425, "grad_norm": 0.3043064781866446, "learning_rate": 3.9460055161482856e-05, "loss": 0.5048, "step": 1576 }, { "epoch": 2.771528998242531, "grad_norm": 0.22792633324728875, "learning_rate": 3.9410972667541674e-05, "loss": 0.51, "step": 1577 }, { "epoch": 2.7732864674868187, "grad_norm": 0.32171401680705364, "learning_rate": 3.936189106066499e-05, "loss": 0.5208, "step": 1578 }, { "epoch": 2.775043936731107, "grad_norm": 0.3079475748858616, "learning_rate": 3.9312810414768854e-05, "loss": 0.5213, "step": 1579 }, { "epoch": 2.7768014059753954, "grad_norm": 0.34162619120039306, "learning_rate": 3.9263730803767816e-05, "loss": 0.5357, "step": 1580 }, { "epoch": 2.7785588752196837, "grad_norm": 0.40974210038000547, "learning_rate": 3.921465230157488e-05, "loss": 0.5079, "step": 1581 }, { "epoch": 2.780316344463972, "grad_norm": 0.24011165636958212, "learning_rate": 3.916557498210138e-05, "loss": 0.5092, "step": 1582 }, { "epoch": 2.7820738137082603, "grad_norm": 0.25615232322469006, "learning_rate": 3.911649891925686e-05, "loss": 0.5177, "step": 1583 }, { "epoch": 2.7838312829525482, "grad_norm": 0.24713987672873622, "learning_rate": 3.906742418694903e-05, "loss": 0.5156, "step": 1584 }, { "epoch": 2.7855887521968365, "grad_norm": 0.2202986443857434, "learning_rate": 3.901835085908352e-05, "loss": 0.5176, "step": 1585 }, { "epoch": 2.787346221441125, "grad_norm": 0.22920014755738477, "learning_rate": 3.8969279009563865e-05, "loss": 0.5219, "step": 1586 }, { "epoch": 2.7891036906854128, "grad_norm": 0.19606352300718707, "learning_rate": 3.89202087122914e-05, "loss": 0.5181, "step": 1587 }, { "epoch": 2.790861159929701, "grad_norm": 0.26057680017298085, "learning_rate": 3.887114004116508e-05, "loss": 0.5144, "step": 1588 }, { "epoch": 2.7926186291739894, "grad_norm": 0.2693503612360842, "learning_rate": 3.8822073070081475e-05, "loss": 0.5202, "step": 1589 }, { "epoch": 2.7943760984182777, "grad_norm": 0.263305272269591, "learning_rate": 3.8773007872934546e-05, "loss": 0.5058, "step": 1590 }, { "epoch": 2.796133567662566, "grad_norm": 0.25591470918195547, "learning_rate": 3.872394452361558e-05, "loss": 0.5207, "step": 1591 }, { "epoch": 2.7978910369068544, "grad_norm": 0.34670142057444425, "learning_rate": 3.8674883096013094e-05, "loss": 0.5087, "step": 1592 }, { "epoch": 2.7996485061511422, "grad_norm": 0.2887464036110664, "learning_rate": 3.862582366401271e-05, "loss": 0.5131, "step": 1593 }, { "epoch": 2.8014059753954306, "grad_norm": 0.22035366203593393, "learning_rate": 3.857676630149706e-05, "loss": 0.5176, "step": 1594 }, { "epoch": 2.803163444639719, "grad_norm": 0.22716577357642156, "learning_rate": 3.852771108234565e-05, "loss": 0.5299, "step": 1595 }, { "epoch": 2.8049209138840068, "grad_norm": 0.2504052889287094, "learning_rate": 3.847865808043475e-05, "loss": 0.522, "step": 1596 }, { "epoch": 2.806678383128295, "grad_norm": 0.2977919957914091, "learning_rate": 3.842960736963727e-05, "loss": 0.516, "step": 1597 }, { "epoch": 2.8084358523725834, "grad_norm": 0.2738544084380623, "learning_rate": 3.83805590238227e-05, "loss": 0.5042, "step": 1598 }, { "epoch": 2.8101933216168717, "grad_norm": 0.22480248639422165, "learning_rate": 3.833151311685699e-05, "loss": 0.5237, "step": 1599 }, { "epoch": 2.81195079086116, "grad_norm": 0.2450769222574398, "learning_rate": 3.828246972260237e-05, "loss": 0.5018, "step": 1600 }, { "epoch": 2.8137082601054484, "grad_norm": 0.27309670694619637, "learning_rate": 3.8233428914917314e-05, "loss": 0.5154, "step": 1601 }, { "epoch": 2.8154657293497363, "grad_norm": 0.5612610437110039, "learning_rate": 3.8184390767656384e-05, "loss": 0.5494, "step": 1602 }, { "epoch": 2.8172231985940246, "grad_norm": 0.3432724703293295, "learning_rate": 3.8135355354670135e-05, "loss": 0.5066, "step": 1603 }, { "epoch": 2.818980667838313, "grad_norm": 0.4113596064238729, "learning_rate": 3.808632274980503e-05, "loss": 0.5192, "step": 1604 }, { "epoch": 2.820738137082601, "grad_norm": 0.3799444801671668, "learning_rate": 3.803729302690328e-05, "loss": 0.5086, "step": 1605 }, { "epoch": 2.822495606326889, "grad_norm": 0.31185893441144164, "learning_rate": 3.798826625980277e-05, "loss": 0.5156, "step": 1606 }, { "epoch": 2.8242530755711774, "grad_norm": 0.34909866307404125, "learning_rate": 3.7939242522336906e-05, "loss": 0.4933, "step": 1607 }, { "epoch": 2.8260105448154658, "grad_norm": 0.36682058005926194, "learning_rate": 3.789022188833456e-05, "loss": 0.5194, "step": 1608 }, { "epoch": 2.827768014059754, "grad_norm": 0.3090962277788772, "learning_rate": 3.7841204431619926e-05, "loss": 0.5072, "step": 1609 }, { "epoch": 2.8295254833040424, "grad_norm": 0.3368303894329845, "learning_rate": 3.7792190226012405e-05, "loss": 0.5111, "step": 1610 }, { "epoch": 2.8312829525483303, "grad_norm": 0.2768290678825463, "learning_rate": 3.774317934532651e-05, "loss": 0.5009, "step": 1611 }, { "epoch": 2.8330404217926186, "grad_norm": 0.292235206580178, "learning_rate": 3.769417186337173e-05, "loss": 0.5138, "step": 1612 }, { "epoch": 2.834797891036907, "grad_norm": 0.2608163492760778, "learning_rate": 3.764516785395244e-05, "loss": 0.5199, "step": 1613 }, { "epoch": 2.836555360281195, "grad_norm": 0.3556593133732311, "learning_rate": 3.759616739086782e-05, "loss": 0.5147, "step": 1614 }, { "epoch": 2.838312829525483, "grad_norm": 0.34898837154890827, "learning_rate": 3.754717054791166e-05, "loss": 0.5079, "step": 1615 }, { "epoch": 2.8400702987697715, "grad_norm": 0.25146605239008823, "learning_rate": 3.749817739887233e-05, "loss": 0.5443, "step": 1616 }, { "epoch": 2.84182776801406, "grad_norm": 0.2650924987293867, "learning_rate": 3.744918801753263e-05, "loss": 0.5045, "step": 1617 }, { "epoch": 2.843585237258348, "grad_norm": 0.3063121990446351, "learning_rate": 3.740020247766966e-05, "loss": 0.5027, "step": 1618 }, { "epoch": 2.8453427065026364, "grad_norm": 0.3528553986689603, "learning_rate": 3.735122085305479e-05, "loss": 0.5218, "step": 1619 }, { "epoch": 2.8471001757469243, "grad_norm": 0.2526196581080518, "learning_rate": 3.7302243217453443e-05, "loss": 0.5223, "step": 1620 }, { "epoch": 2.8488576449912126, "grad_norm": 0.27650716522826324, "learning_rate": 3.725326964462506e-05, "loss": 0.5351, "step": 1621 }, { "epoch": 2.850615114235501, "grad_norm": 0.3337288892300705, "learning_rate": 3.720430020832295e-05, "loss": 0.5104, "step": 1622 }, { "epoch": 2.852372583479789, "grad_norm": 0.27118311665717576, "learning_rate": 3.715533498229419e-05, "loss": 0.5271, "step": 1623 }, { "epoch": 2.854130052724077, "grad_norm": 0.3045779784819228, "learning_rate": 3.7106374040279544e-05, "loss": 0.5152, "step": 1624 }, { "epoch": 2.8558875219683655, "grad_norm": 0.31878489374237406, "learning_rate": 3.705741745601331e-05, "loss": 0.5126, "step": 1625 }, { "epoch": 2.857644991212654, "grad_norm": 0.2682796559352533, "learning_rate": 3.7008465303223207e-05, "loss": 0.5159, "step": 1626 }, { "epoch": 2.859402460456942, "grad_norm": 0.22378763615201488, "learning_rate": 3.6959517655630285e-05, "loss": 0.5222, "step": 1627 }, { "epoch": 2.8611599297012305, "grad_norm": 0.27840353963736947, "learning_rate": 3.691057458694882e-05, "loss": 0.5145, "step": 1628 }, { "epoch": 2.8629173989455183, "grad_norm": 0.2902125149600713, "learning_rate": 3.686163617088621e-05, "loss": 0.5187, "step": 1629 }, { "epoch": 2.8646748681898067, "grad_norm": 0.3202713963346325, "learning_rate": 3.68127024811428e-05, "loss": 0.5154, "step": 1630 }, { "epoch": 2.866432337434095, "grad_norm": 0.32532956604760793, "learning_rate": 3.676377359141186e-05, "loss": 0.5205, "step": 1631 }, { "epoch": 2.868189806678383, "grad_norm": 0.26277870798977054, "learning_rate": 3.67148495753794e-05, "loss": 0.5181, "step": 1632 }, { "epoch": 2.869947275922671, "grad_norm": 0.3969386833376756, "learning_rate": 3.66659305067241e-05, "loss": 0.5095, "step": 1633 }, { "epoch": 2.8717047451669595, "grad_norm": 0.4565569607922183, "learning_rate": 3.661701645911722e-05, "loss": 0.5164, "step": 1634 }, { "epoch": 2.873462214411248, "grad_norm": 0.33186195827288406, "learning_rate": 3.656810750622241e-05, "loss": 0.5254, "step": 1635 }, { "epoch": 2.875219683655536, "grad_norm": 0.26518920802667134, "learning_rate": 3.651920372169568e-05, "loss": 0.5064, "step": 1636 }, { "epoch": 2.8769771528998245, "grad_norm": 0.31774470946790484, "learning_rate": 3.647030517918523e-05, "loss": 0.5164, "step": 1637 }, { "epoch": 2.8787346221441124, "grad_norm": 0.3116010189097384, "learning_rate": 3.642141195233138e-05, "loss": 0.5171, "step": 1638 }, { "epoch": 2.8804920913884007, "grad_norm": 0.22863833482199553, "learning_rate": 3.637252411476647e-05, "loss": 0.5264, "step": 1639 }, { "epoch": 2.882249560632689, "grad_norm": 0.22478211665227865, "learning_rate": 3.63236417401147e-05, "loss": 0.5055, "step": 1640 }, { "epoch": 2.884007029876977, "grad_norm": 0.206067099001712, "learning_rate": 3.627476490199202e-05, "loss": 0.5193, "step": 1641 }, { "epoch": 2.885764499121265, "grad_norm": 0.20095598473004392, "learning_rate": 3.6225893674006074e-05, "loss": 0.515, "step": 1642 }, { "epoch": 2.8875219683655535, "grad_norm": 0.2263296342101525, "learning_rate": 3.617702812975603e-05, "loss": 0.512, "step": 1643 }, { "epoch": 2.889279437609842, "grad_norm": 0.31031885070289067, "learning_rate": 3.612816834283255e-05, "loss": 0.5242, "step": 1644 }, { "epoch": 2.89103690685413, "grad_norm": 0.31376790924991127, "learning_rate": 3.6079314386817575e-05, "loss": 0.5234, "step": 1645 }, { "epoch": 2.8927943760984185, "grad_norm": 0.23316984147684364, "learning_rate": 3.603046633528429e-05, "loss": 0.5194, "step": 1646 }, { "epoch": 2.8945518453427064, "grad_norm": 0.35771562089839887, "learning_rate": 3.598162426179695e-05, "loss": 0.5257, "step": 1647 }, { "epoch": 2.8963093145869947, "grad_norm": 0.2836305542534022, "learning_rate": 3.5932788239910844e-05, "loss": 0.5197, "step": 1648 }, { "epoch": 2.898066783831283, "grad_norm": 2.254357481556403, "learning_rate": 3.5883958343172155e-05, "loss": 0.5527, "step": 1649 }, { "epoch": 2.899824253075571, "grad_norm": 0.3939745735305796, "learning_rate": 3.5835134645117814e-05, "loss": 0.5088, "step": 1650 }, { "epoch": 2.9015817223198592, "grad_norm": 0.48075910655678133, "learning_rate": 3.578631721927545e-05, "loss": 0.5039, "step": 1651 }, { "epoch": 2.9033391915641475, "grad_norm": 0.39050812764737325, "learning_rate": 3.573750613916319e-05, "loss": 0.5189, "step": 1652 }, { "epoch": 2.905096660808436, "grad_norm": 0.2862196340137482, "learning_rate": 3.568870147828963e-05, "loss": 0.5178, "step": 1653 }, { "epoch": 2.906854130052724, "grad_norm": 0.40547829356915516, "learning_rate": 3.5639903310153744e-05, "loss": 0.5143, "step": 1654 }, { "epoch": 2.9086115992970125, "grad_norm": 0.4600689317188986, "learning_rate": 3.559111170824468e-05, "loss": 0.5213, "step": 1655 }, { "epoch": 2.9103690685413004, "grad_norm": 0.37866841178744765, "learning_rate": 3.554232674604171e-05, "loss": 0.5215, "step": 1656 }, { "epoch": 2.9121265377855887, "grad_norm": 0.39003927959477974, "learning_rate": 3.54935484970141e-05, "loss": 0.5171, "step": 1657 }, { "epoch": 2.913884007029877, "grad_norm": 3.1092263077034916, "learning_rate": 3.5444777034620995e-05, "loss": 0.5372, "step": 1658 }, { "epoch": 2.9156414762741654, "grad_norm": 0.6245776178913824, "learning_rate": 3.539601243231136e-05, "loss": 0.5264, "step": 1659 }, { "epoch": 2.9173989455184532, "grad_norm": 0.7449408201428886, "learning_rate": 3.5347254763523813e-05, "loss": 0.4988, "step": 1660 }, { "epoch": 2.9191564147627416, "grad_norm": 0.36256737271163625, "learning_rate": 3.52985041016865e-05, "loss": 0.5099, "step": 1661 }, { "epoch": 2.92091388400703, "grad_norm": 0.4528014998819728, "learning_rate": 3.524976052021704e-05, "loss": 0.5146, "step": 1662 }, { "epoch": 2.922671353251318, "grad_norm": 0.5654657894772749, "learning_rate": 3.520102409252239e-05, "loss": 0.5135, "step": 1663 }, { "epoch": 2.9244288224956065, "grad_norm": 0.3702485188793454, "learning_rate": 3.515229489199873e-05, "loss": 0.5162, "step": 1664 }, { "epoch": 2.9261862917398944, "grad_norm": 0.3974272540727149, "learning_rate": 3.510357299203136e-05, "loss": 0.5181, "step": 1665 }, { "epoch": 2.9279437609841827, "grad_norm": 0.4471481389329429, "learning_rate": 3.5054858465994567e-05, "loss": 0.5118, "step": 1666 }, { "epoch": 2.929701230228471, "grad_norm": 0.2623973674516039, "learning_rate": 3.500615138725156e-05, "loss": 0.5153, "step": 1667 }, { "epoch": 2.9314586994727594, "grad_norm": 0.4470133310869647, "learning_rate": 3.49574518291543e-05, "loss": 0.5021, "step": 1668 }, { "epoch": 2.9332161687170473, "grad_norm": 0.3953295410141132, "learning_rate": 3.490875986504348e-05, "loss": 0.523, "step": 1669 }, { "epoch": 2.9349736379613356, "grad_norm": 0.313542698702647, "learning_rate": 3.4860075568248275e-05, "loss": 0.5225, "step": 1670 }, { "epoch": 2.936731107205624, "grad_norm": 0.3103431048099697, "learning_rate": 3.481139901208638e-05, "loss": 0.5282, "step": 1671 }, { "epoch": 2.9384885764499122, "grad_norm": 0.5135275988175928, "learning_rate": 3.4762730269863817e-05, "loss": 0.5179, "step": 1672 }, { "epoch": 2.9402460456942006, "grad_norm": 0.36575227203332616, "learning_rate": 3.4714069414874795e-05, "loss": 0.5159, "step": 1673 }, { "epoch": 2.9420035149384884, "grad_norm": 0.39195103455596747, "learning_rate": 3.466541652040172e-05, "loss": 0.5235, "step": 1674 }, { "epoch": 2.9437609841827768, "grad_norm": 0.35430386409744147, "learning_rate": 3.4616771659714956e-05, "loss": 0.5046, "step": 1675 }, { "epoch": 2.945518453427065, "grad_norm": 0.37331159442604656, "learning_rate": 3.456813490607279e-05, "loss": 0.5112, "step": 1676 }, { "epoch": 2.9472759226713534, "grad_norm": 0.3354698742651584, "learning_rate": 3.451950633272129e-05, "loss": 0.5151, "step": 1677 }, { "epoch": 2.9490333919156413, "grad_norm": 0.34022851243168606, "learning_rate": 3.44708860128942e-05, "loss": 0.5186, "step": 1678 }, { "epoch": 2.9507908611599296, "grad_norm": 0.38302164061285904, "learning_rate": 3.442227401981285e-05, "loss": 0.5003, "step": 1679 }, { "epoch": 2.952548330404218, "grad_norm": 0.4136332410657456, "learning_rate": 3.4373670426686034e-05, "loss": 0.5234, "step": 1680 }, { "epoch": 2.9543057996485063, "grad_norm": 0.2701576734636919, "learning_rate": 3.432507530670988e-05, "loss": 0.5195, "step": 1681 }, { "epoch": 2.9560632688927946, "grad_norm": 0.267945392971995, "learning_rate": 3.427648873306775e-05, "loss": 0.5073, "step": 1682 }, { "epoch": 2.9578207381370825, "grad_norm": 0.3104763062539055, "learning_rate": 3.422791077893014e-05, "loss": 0.518, "step": 1683 }, { "epoch": 2.959578207381371, "grad_norm": 0.4768790527240236, "learning_rate": 3.41793415174546e-05, "loss": 0.5238, "step": 1684 }, { "epoch": 2.961335676625659, "grad_norm": 0.3083945630382254, "learning_rate": 3.4130781021785535e-05, "loss": 0.5184, "step": 1685 }, { "epoch": 2.9630931458699474, "grad_norm": 0.258556485944084, "learning_rate": 3.408222936505419e-05, "loss": 0.5119, "step": 1686 }, { "epoch": 2.9648506151142353, "grad_norm": 0.26641440510688025, "learning_rate": 3.403368662037847e-05, "loss": 0.5167, "step": 1687 }, { "epoch": 2.9666080843585236, "grad_norm": 0.43143350216147014, "learning_rate": 3.398515286086286e-05, "loss": 0.5232, "step": 1688 }, { "epoch": 2.968365553602812, "grad_norm": 0.32564535902636155, "learning_rate": 3.393662815959836e-05, "loss": 0.5152, "step": 1689 }, { "epoch": 2.9701230228471003, "grad_norm": 0.25840167670050623, "learning_rate": 3.388811258966227e-05, "loss": 0.5193, "step": 1690 }, { "epoch": 2.9718804920913886, "grad_norm": 0.4064706226170321, "learning_rate": 3.3839606224118165e-05, "loss": 0.5218, "step": 1691 }, { "epoch": 2.9736379613356765, "grad_norm": 0.30745044148409273, "learning_rate": 3.379110913601576e-05, "loss": 0.5148, "step": 1692 }, { "epoch": 2.975395430579965, "grad_norm": 0.2573666539560483, "learning_rate": 3.374262139839076e-05, "loss": 0.5163, "step": 1693 }, { "epoch": 2.977152899824253, "grad_norm": 0.3888101887293525, "learning_rate": 3.3694143084264875e-05, "loss": 0.5164, "step": 1694 }, { "epoch": 2.9789103690685415, "grad_norm": 0.2448492299781427, "learning_rate": 3.364567426664555e-05, "loss": 0.517, "step": 1695 }, { "epoch": 2.9806678383128293, "grad_norm": 0.270286451599413, "learning_rate": 3.359721501852594e-05, "loss": 0.5176, "step": 1696 }, { "epoch": 2.9824253075571177, "grad_norm": 0.29863324536127034, "learning_rate": 3.354876541288479e-05, "loss": 0.5267, "step": 1697 }, { "epoch": 2.984182776801406, "grad_norm": 0.24120746132140555, "learning_rate": 3.3500325522686336e-05, "loss": 0.5213, "step": 1698 }, { "epoch": 2.9859402460456943, "grad_norm": 0.22792255586861615, "learning_rate": 3.3451895420880186e-05, "loss": 0.4989, "step": 1699 }, { "epoch": 2.9876977152899826, "grad_norm": 0.22931914371454992, "learning_rate": 3.3403475180401204e-05, "loss": 0.5247, "step": 1700 }, { "epoch": 2.9894551845342705, "grad_norm": 0.24146470413397594, "learning_rate": 3.335506487416939e-05, "loss": 0.5105, "step": 1701 }, { "epoch": 2.991212653778559, "grad_norm": 0.2153974255890474, "learning_rate": 3.3306664575089786e-05, "loss": 0.5213, "step": 1702 }, { "epoch": 2.992970123022847, "grad_norm": 0.23314849481078434, "learning_rate": 3.325827435605234e-05, "loss": 0.5157, "step": 1703 }, { "epoch": 2.9947275922671355, "grad_norm": 0.2173710059262764, "learning_rate": 3.32098942899319e-05, "loss": 0.5149, "step": 1704 }, { "epoch": 2.9964850615114234, "grad_norm": 0.2099275085452097, "learning_rate": 3.3161524449587954e-05, "loss": 0.5185, "step": 1705 }, { "epoch": 2.9982425307557117, "grad_norm": 0.24391537786067957, "learning_rate": 3.311316490786459e-05, "loss": 0.5163, "step": 1706 }, { "epoch": 3.0, "grad_norm": 0.2388842468209982, "learning_rate": 3.306481573759043e-05, "loss": 0.4977, "step": 1707 }, { "epoch": 3.0017574692442883, "grad_norm": 0.2889233103738066, "learning_rate": 3.3016477011578414e-05, "loss": 0.486, "step": 1708 }, { "epoch": 3.0035149384885766, "grad_norm": 0.26166083870105233, "learning_rate": 3.296814880262582e-05, "loss": 0.4829, "step": 1709 }, { "epoch": 3.0052724077328645, "grad_norm": 0.2849796138663078, "learning_rate": 3.291983118351405e-05, "loss": 0.4928, "step": 1710 }, { "epoch": 3.007029876977153, "grad_norm": 0.3196967637618999, "learning_rate": 3.287152422700857e-05, "loss": 0.4996, "step": 1711 }, { "epoch": 3.008787346221441, "grad_norm": 0.27183174817430267, "learning_rate": 3.282322800585877e-05, "loss": 0.4931, "step": 1712 }, { "epoch": 3.0105448154657295, "grad_norm": 0.43388817631275634, "learning_rate": 3.277494259279789e-05, "loss": 0.4899, "step": 1713 }, { "epoch": 3.0123022847100174, "grad_norm": 0.28179277407886166, "learning_rate": 3.27266680605429e-05, "loss": 0.4981, "step": 1714 }, { "epoch": 3.0140597539543057, "grad_norm": 0.35285471400503526, "learning_rate": 3.2678404481794366e-05, "loss": 0.4817, "step": 1715 }, { "epoch": 3.015817223198594, "grad_norm": 0.4008883498696981, "learning_rate": 3.263015192923637e-05, "loss": 0.4973, "step": 1716 }, { "epoch": 3.0175746924428823, "grad_norm": 0.3743216471178642, "learning_rate": 3.2581910475536366e-05, "loss": 0.4859, "step": 1717 }, { "epoch": 3.0193321616871707, "grad_norm": 0.32097327601498915, "learning_rate": 3.253368019334513e-05, "loss": 0.5007, "step": 1718 }, { "epoch": 3.0210896309314585, "grad_norm": 0.30433808493000053, "learning_rate": 3.24854611552966e-05, "loss": 0.5036, "step": 1719 }, { "epoch": 3.022847100175747, "grad_norm": 0.24224194652446732, "learning_rate": 3.2437253434007756e-05, "loss": 0.4876, "step": 1720 }, { "epoch": 3.024604569420035, "grad_norm": 0.27334234972024796, "learning_rate": 3.238905710207857e-05, "loss": 0.4928, "step": 1721 }, { "epoch": 3.0263620386643235, "grad_norm": 0.22556743339623192, "learning_rate": 3.234087223209184e-05, "loss": 0.4977, "step": 1722 }, { "epoch": 3.0281195079086114, "grad_norm": 0.2655900136352954, "learning_rate": 3.229269889661311e-05, "loss": 0.4877, "step": 1723 }, { "epoch": 3.0298769771528997, "grad_norm": 0.2660032893690447, "learning_rate": 3.224453716819056e-05, "loss": 0.4948, "step": 1724 }, { "epoch": 3.031634446397188, "grad_norm": 0.2627891979103179, "learning_rate": 3.219638711935488e-05, "loss": 0.481, "step": 1725 }, { "epoch": 3.0333919156414764, "grad_norm": 0.21584564926302388, "learning_rate": 3.2148248822619164e-05, "loss": 0.4764, "step": 1726 }, { "epoch": 3.0351493848857647, "grad_norm": 0.23457832433787812, "learning_rate": 3.210012235047882e-05, "loss": 0.4795, "step": 1727 }, { "epoch": 3.0369068541300526, "grad_norm": 0.27750721801700934, "learning_rate": 3.205200777541145e-05, "loss": 0.4955, "step": 1728 }, { "epoch": 3.038664323374341, "grad_norm": 0.21427761633445264, "learning_rate": 3.2003905169876734e-05, "loss": 0.4946, "step": 1729 }, { "epoch": 3.040421792618629, "grad_norm": 0.2659907617009313, "learning_rate": 3.195581460631633e-05, "loss": 0.4889, "step": 1730 }, { "epoch": 3.0421792618629175, "grad_norm": 0.22808747606283925, "learning_rate": 3.190773615715375e-05, "loss": 0.5052, "step": 1731 }, { "epoch": 3.0439367311072054, "grad_norm": 0.2815572290171488, "learning_rate": 3.185966989479427e-05, "loss": 0.5021, "step": 1732 }, { "epoch": 3.0456942003514937, "grad_norm": 0.21469092517268862, "learning_rate": 3.181161589162482e-05, "loss": 0.4856, "step": 1733 }, { "epoch": 3.047451669595782, "grad_norm": 0.2680195053380023, "learning_rate": 3.176357422001386e-05, "loss": 0.4889, "step": 1734 }, { "epoch": 3.0492091388400704, "grad_norm": 0.24031429049824185, "learning_rate": 3.171554495231127e-05, "loss": 0.5016, "step": 1735 }, { "epoch": 3.0509666080843587, "grad_norm": 0.2376183983568777, "learning_rate": 3.166752816084826e-05, "loss": 0.4957, "step": 1736 }, { "epoch": 3.0527240773286466, "grad_norm": 0.19949321879266377, "learning_rate": 3.161952391793726e-05, "loss": 0.4974, "step": 1737 }, { "epoch": 3.054481546572935, "grad_norm": 0.21321925191490487, "learning_rate": 3.1571532295871775e-05, "loss": 0.4894, "step": 1738 }, { "epoch": 3.0562390158172232, "grad_norm": 0.20875923054965836, "learning_rate": 3.152355336692634e-05, "loss": 0.4839, "step": 1739 }, { "epoch": 3.0579964850615116, "grad_norm": 0.2361236612027993, "learning_rate": 3.147558720335635e-05, "loss": 0.5031, "step": 1740 }, { "epoch": 3.0597539543057994, "grad_norm": 0.19197661577127245, "learning_rate": 3.142763387739798e-05, "loss": 0.4923, "step": 1741 }, { "epoch": 3.0615114235500878, "grad_norm": 0.23098616077794956, "learning_rate": 3.137969346126806e-05, "loss": 0.4987, "step": 1742 }, { "epoch": 3.063268892794376, "grad_norm": 0.21722285983115003, "learning_rate": 3.133176602716399e-05, "loss": 0.4951, "step": 1743 }, { "epoch": 3.0650263620386644, "grad_norm": 0.2407990578782504, "learning_rate": 3.128385164726366e-05, "loss": 0.4905, "step": 1744 }, { "epoch": 3.0667838312829527, "grad_norm": 0.19486567199959173, "learning_rate": 3.1235950393725214e-05, "loss": 0.5038, "step": 1745 }, { "epoch": 3.0685413005272406, "grad_norm": 0.2096690583989743, "learning_rate": 3.1188062338687096e-05, "loss": 0.504, "step": 1746 }, { "epoch": 3.070298769771529, "grad_norm": 0.1921554851073909, "learning_rate": 3.114018755426784e-05, "loss": 0.488, "step": 1747 }, { "epoch": 3.0720562390158173, "grad_norm": 0.21730922627603244, "learning_rate": 3.1092326112566e-05, "loss": 0.4985, "step": 1748 }, { "epoch": 3.0738137082601056, "grad_norm": 0.2513960365524484, "learning_rate": 3.1044478085660066e-05, "loss": 0.4851, "step": 1749 }, { "epoch": 3.0755711775043935, "grad_norm": 0.22278765690914507, "learning_rate": 3.0996643545608296e-05, "loss": 0.4861, "step": 1750 }, { "epoch": 3.077328646748682, "grad_norm": 0.242621674173873, "learning_rate": 3.094882256444863e-05, "loss": 0.4948, "step": 1751 }, { "epoch": 3.07908611599297, "grad_norm": 0.19245995245722927, "learning_rate": 3.090101521419861e-05, "loss": 0.4986, "step": 1752 }, { "epoch": 3.0808435852372584, "grad_norm": 0.2313559029714549, "learning_rate": 3.085322156685522e-05, "loss": 0.4916, "step": 1753 }, { "epoch": 3.0826010544815468, "grad_norm": 0.21490250331530733, "learning_rate": 3.0805441694394866e-05, "loss": 0.4923, "step": 1754 }, { "epoch": 3.0843585237258346, "grad_norm": 0.20295830259651235, "learning_rate": 3.075767566877316e-05, "loss": 0.4912, "step": 1755 }, { "epoch": 3.086115992970123, "grad_norm": 0.2785006248996036, "learning_rate": 3.070992356192487e-05, "loss": 0.5052, "step": 1756 }, { "epoch": 3.0878734622144113, "grad_norm": 0.2483964685463428, "learning_rate": 3.06621854457638e-05, "loss": 0.4981, "step": 1757 }, { "epoch": 3.0896309314586996, "grad_norm": 0.2635006659206068, "learning_rate": 3.0614461392182675e-05, "loss": 0.5099, "step": 1758 }, { "epoch": 3.0913884007029875, "grad_norm": 0.2925145512758609, "learning_rate": 3.0566751473053094e-05, "loss": 0.4948, "step": 1759 }, { "epoch": 3.093145869947276, "grad_norm": 0.24782325719081086, "learning_rate": 3.051905576022531e-05, "loss": 0.4875, "step": 1760 }, { "epoch": 3.094903339191564, "grad_norm": 0.2868177917252482, "learning_rate": 3.047137432552822e-05, "loss": 0.4952, "step": 1761 }, { "epoch": 3.0966608084358525, "grad_norm": 0.17181489161903504, "learning_rate": 3.0423707240769197e-05, "loss": 0.4793, "step": 1762 }, { "epoch": 3.0984182776801408, "grad_norm": 0.2552674142847942, "learning_rate": 3.0376054577733983e-05, "loss": 0.483, "step": 1763 }, { "epoch": 3.1001757469244287, "grad_norm": 0.23991830618958415, "learning_rate": 3.0328416408186664e-05, "loss": 0.4811, "step": 1764 }, { "epoch": 3.101933216168717, "grad_norm": 0.20327866784647658, "learning_rate": 3.028079280386945e-05, "loss": 0.4827, "step": 1765 }, { "epoch": 3.1036906854130053, "grad_norm": 0.19735599804048357, "learning_rate": 3.023318383650262e-05, "loss": 0.4824, "step": 1766 }, { "epoch": 3.1054481546572936, "grad_norm": 0.22116194377601267, "learning_rate": 3.018558957778443e-05, "loss": 0.487, "step": 1767 }, { "epoch": 3.1072056239015815, "grad_norm": 0.21549202955548422, "learning_rate": 3.0138010099390964e-05, "loss": 0.4875, "step": 1768 }, { "epoch": 3.10896309314587, "grad_norm": 0.22334037365195678, "learning_rate": 3.0090445472976065e-05, "loss": 0.5013, "step": 1769 }, { "epoch": 3.110720562390158, "grad_norm": 0.2381414434896002, "learning_rate": 3.0042895770171195e-05, "loss": 0.4986, "step": 1770 }, { "epoch": 3.1124780316344465, "grad_norm": 0.23225587222139546, "learning_rate": 2.9995361062585354e-05, "loss": 0.4936, "step": 1771 }, { "epoch": 3.114235500878735, "grad_norm": 0.20199125275362204, "learning_rate": 2.9947841421804946e-05, "loss": 0.4856, "step": 1772 }, { "epoch": 3.1159929701230227, "grad_norm": 0.2500789724215127, "learning_rate": 2.9900336919393682e-05, "loss": 0.4849, "step": 1773 }, { "epoch": 3.117750439367311, "grad_norm": 0.21545736944067295, "learning_rate": 2.9852847626892508e-05, "loss": 0.482, "step": 1774 }, { "epoch": 3.1195079086115993, "grad_norm": 0.2639977165855361, "learning_rate": 2.9805373615819425e-05, "loss": 0.4968, "step": 1775 }, { "epoch": 3.1212653778558876, "grad_norm": 0.2697421545636338, "learning_rate": 2.9757914957669433e-05, "loss": 0.4902, "step": 1776 }, { "epoch": 3.1230228471001755, "grad_norm": 0.21064004237263412, "learning_rate": 2.971047172391442e-05, "loss": 0.4841, "step": 1777 }, { "epoch": 3.124780316344464, "grad_norm": 0.3231973375656271, "learning_rate": 2.9663043986003032e-05, "loss": 0.5006, "step": 1778 }, { "epoch": 3.126537785588752, "grad_norm": 0.28876845077125063, "learning_rate": 2.9615631815360603e-05, "loss": 0.4978, "step": 1779 }, { "epoch": 3.1282952548330405, "grad_norm": 0.24502700356189225, "learning_rate": 2.9568235283388982e-05, "loss": 0.483, "step": 1780 }, { "epoch": 3.130052724077329, "grad_norm": 0.279841745056166, "learning_rate": 2.952085446146651e-05, "loss": 0.4877, "step": 1781 }, { "epoch": 3.1318101933216167, "grad_norm": 0.23656464785411171, "learning_rate": 2.947348942094783e-05, "loss": 0.5003, "step": 1782 }, { "epoch": 3.133567662565905, "grad_norm": 0.2624867210866497, "learning_rate": 2.9426140233163846e-05, "loss": 0.4873, "step": 1783 }, { "epoch": 3.1353251318101933, "grad_norm": 0.20316703628189253, "learning_rate": 2.937880696942158e-05, "loss": 0.4896, "step": 1784 }, { "epoch": 3.1370826010544817, "grad_norm": 0.23139990731126095, "learning_rate": 2.933148970100407e-05, "loss": 0.4837, "step": 1785 }, { "epoch": 3.1388400702987695, "grad_norm": 0.27211024246789156, "learning_rate": 2.928418849917027e-05, "loss": 0.5033, "step": 1786 }, { "epoch": 3.140597539543058, "grad_norm": 0.3024833131250496, "learning_rate": 2.9236903435154933e-05, "loss": 0.492, "step": 1787 }, { "epoch": 3.142355008787346, "grad_norm": 0.20306980548620168, "learning_rate": 2.91896345801685e-05, "loss": 0.5013, "step": 1788 }, { "epoch": 3.1441124780316345, "grad_norm": 0.2519316615140102, "learning_rate": 2.9142382005397035e-05, "loss": 0.4893, "step": 1789 }, { "epoch": 3.145869947275923, "grad_norm": 0.2698454439327999, "learning_rate": 2.909514578200205e-05, "loss": 0.4756, "step": 1790 }, { "epoch": 3.1476274165202107, "grad_norm": 0.24162774724837463, "learning_rate": 2.9047925981120443e-05, "loss": 0.4965, "step": 1791 }, { "epoch": 3.149384885764499, "grad_norm": 0.2447768874030989, "learning_rate": 2.9000722673864375e-05, "loss": 0.499, "step": 1792 }, { "epoch": 3.1511423550087874, "grad_norm": 0.24393824989915613, "learning_rate": 2.8953535931321168e-05, "loss": 0.4783, "step": 1793 }, { "epoch": 3.1528998242530757, "grad_norm": 0.22544554013533713, "learning_rate": 2.890636582455324e-05, "loss": 0.49, "step": 1794 }, { "epoch": 3.1546572934973636, "grad_norm": 0.25784184064715376, "learning_rate": 2.8859212424597883e-05, "loss": 0.5138, "step": 1795 }, { "epoch": 3.156414762741652, "grad_norm": 0.2245775497337154, "learning_rate": 2.881207580246728e-05, "loss": 0.4822, "step": 1796 }, { "epoch": 3.15817223198594, "grad_norm": 0.20279796032907346, "learning_rate": 2.8764956029148328e-05, "loss": 0.4954, "step": 1797 }, { "epoch": 3.1599297012302285, "grad_norm": 0.2625723428107103, "learning_rate": 2.871785317560254e-05, "loss": 0.5056, "step": 1798 }, { "epoch": 3.161687170474517, "grad_norm": 0.24689967556960632, "learning_rate": 2.867076731276599e-05, "loss": 0.4985, "step": 1799 }, { "epoch": 3.1634446397188047, "grad_norm": 0.2627204932718511, "learning_rate": 2.8623698511549133e-05, "loss": 0.4945, "step": 1800 }, { "epoch": 3.165202108963093, "grad_norm": 0.217768107008841, "learning_rate": 2.8576646842836715e-05, "loss": 0.4984, "step": 1801 }, { "epoch": 3.1669595782073814, "grad_norm": 0.30754197394482674, "learning_rate": 2.8529612377487712e-05, "loss": 0.4887, "step": 1802 }, { "epoch": 3.1687170474516697, "grad_norm": 0.23364393887542537, "learning_rate": 2.8482595186335156e-05, "loss": 0.4928, "step": 1803 }, { "epoch": 3.1704745166959576, "grad_norm": 0.19394435028557983, "learning_rate": 2.8435595340186117e-05, "loss": 0.499, "step": 1804 }, { "epoch": 3.172231985940246, "grad_norm": 0.21092206328311308, "learning_rate": 2.8388612909821515e-05, "loss": 0.4812, "step": 1805 }, { "epoch": 3.1739894551845342, "grad_norm": 0.20915096306428435, "learning_rate": 2.834164796599602e-05, "loss": 0.483, "step": 1806 }, { "epoch": 3.1757469244288226, "grad_norm": 0.18047886658251477, "learning_rate": 2.8294700579437996e-05, "loss": 0.4817, "step": 1807 }, { "epoch": 3.177504393673111, "grad_norm": 0.20066476968464522, "learning_rate": 2.8247770820849338e-05, "loss": 0.4821, "step": 1808 }, { "epoch": 3.1792618629173988, "grad_norm": 0.21774438695608025, "learning_rate": 2.820085876090546e-05, "loss": 0.4886, "step": 1809 }, { "epoch": 3.181019332161687, "grad_norm": 0.1824592114378444, "learning_rate": 2.8153964470255047e-05, "loss": 0.5078, "step": 1810 }, { "epoch": 3.1827768014059754, "grad_norm": 0.19297630778487723, "learning_rate": 2.8107088019520072e-05, "loss": 0.5045, "step": 1811 }, { "epoch": 3.1845342706502637, "grad_norm": 0.1805095040774105, "learning_rate": 2.8060229479295594e-05, "loss": 0.4842, "step": 1812 }, { "epoch": 3.1862917398945516, "grad_norm": 0.21475562348971913, "learning_rate": 2.8013388920149735e-05, "loss": 0.4886, "step": 1813 }, { "epoch": 3.18804920913884, "grad_norm": 0.1823072303159873, "learning_rate": 2.796656641262355e-05, "loss": 0.5013, "step": 1814 }, { "epoch": 3.1898066783831283, "grad_norm": 0.24514504414702706, "learning_rate": 2.791976202723088e-05, "loss": 0.4966, "step": 1815 }, { "epoch": 3.1915641476274166, "grad_norm": 0.22241257108198306, "learning_rate": 2.787297583445828e-05, "loss": 0.4882, "step": 1816 }, { "epoch": 3.193321616871705, "grad_norm": 0.20618679315622668, "learning_rate": 2.782620790476492e-05, "loss": 0.486, "step": 1817 }, { "epoch": 3.195079086115993, "grad_norm": 0.2301620113891946, "learning_rate": 2.7779458308582426e-05, "loss": 0.4941, "step": 1818 }, { "epoch": 3.196836555360281, "grad_norm": 0.18790574397026305, "learning_rate": 2.7732727116314886e-05, "loss": 0.4798, "step": 1819 }, { "epoch": 3.1985940246045694, "grad_norm": 0.2068211307638744, "learning_rate": 2.7686014398338606e-05, "loss": 0.4932, "step": 1820 }, { "epoch": 3.2003514938488578, "grad_norm": 0.21213380519648106, "learning_rate": 2.7639320225002108e-05, "loss": 0.496, "step": 1821 }, { "epoch": 3.202108963093146, "grad_norm": 0.1854518116516676, "learning_rate": 2.7592644666625966e-05, "loss": 0.5005, "step": 1822 }, { "epoch": 3.203866432337434, "grad_norm": 0.25645520926783266, "learning_rate": 2.7545987793502723e-05, "loss": 0.4873, "step": 1823 }, { "epoch": 3.2056239015817223, "grad_norm": 0.20460097738578062, "learning_rate": 2.7499349675896795e-05, "loss": 0.4917, "step": 1824 }, { "epoch": 3.2073813708260106, "grad_norm": 0.22957806457459182, "learning_rate": 2.7452730384044346e-05, "loss": 0.4942, "step": 1825 }, { "epoch": 3.209138840070299, "grad_norm": 0.22007744904365834, "learning_rate": 2.7406129988153184e-05, "loss": 0.483, "step": 1826 }, { "epoch": 3.210896309314587, "grad_norm": 0.2053924135533108, "learning_rate": 2.7359548558402655e-05, "loss": 0.4814, "step": 1827 }, { "epoch": 3.212653778558875, "grad_norm": 0.20801038978345204, "learning_rate": 2.7312986164943552e-05, "loss": 0.4876, "step": 1828 }, { "epoch": 3.2144112478031635, "grad_norm": 0.25057255314681914, "learning_rate": 2.7266442877898013e-05, "loss": 0.497, "step": 1829 }, { "epoch": 3.2161687170474518, "grad_norm": 0.24122581613033203, "learning_rate": 2.7219918767359374e-05, "loss": 0.5017, "step": 1830 }, { "epoch": 3.21792618629174, "grad_norm": 0.22680003953783176, "learning_rate": 2.7173413903392113e-05, "loss": 0.4856, "step": 1831 }, { "epoch": 3.219683655536028, "grad_norm": 0.25338449345263264, "learning_rate": 2.712692835603171e-05, "loss": 0.4952, "step": 1832 }, { "epoch": 3.2214411247803163, "grad_norm": 0.23275580122726358, "learning_rate": 2.7080462195284546e-05, "loss": 0.5077, "step": 1833 }, { "epoch": 3.2231985940246046, "grad_norm": 0.23107060171027488, "learning_rate": 2.7034015491127843e-05, "loss": 0.499, "step": 1834 }, { "epoch": 3.224956063268893, "grad_norm": 0.2512380457379978, "learning_rate": 2.698758831350949e-05, "loss": 0.4967, "step": 1835 }, { "epoch": 3.226713532513181, "grad_norm": 0.3187426536691022, "learning_rate": 2.694118073234797e-05, "loss": 0.4844, "step": 1836 }, { "epoch": 3.228471001757469, "grad_norm": 0.2371098328039936, "learning_rate": 2.6894792817532267e-05, "loss": 0.4986, "step": 1837 }, { "epoch": 3.2302284710017575, "grad_norm": 0.26393622281846896, "learning_rate": 2.6848424638921734e-05, "loss": 0.4919, "step": 1838 }, { "epoch": 3.231985940246046, "grad_norm": 0.24791203493717093, "learning_rate": 2.680207626634603e-05, "loss": 0.492, "step": 1839 }, { "epoch": 3.233743409490334, "grad_norm": 0.2041083768819497, "learning_rate": 2.6755747769604948e-05, "loss": 0.4821, "step": 1840 }, { "epoch": 3.235500878734622, "grad_norm": 0.22375829811713902, "learning_rate": 2.6709439218468376e-05, "loss": 0.4896, "step": 1841 }, { "epoch": 3.2372583479789103, "grad_norm": 0.23767011035985453, "learning_rate": 2.6663150682676157e-05, "loss": 0.4918, "step": 1842 }, { "epoch": 3.2390158172231986, "grad_norm": 0.2183338015382699, "learning_rate": 2.6616882231937967e-05, "loss": 0.4847, "step": 1843 }, { "epoch": 3.240773286467487, "grad_norm": 0.19750914429769598, "learning_rate": 2.6570633935933286e-05, "loss": 0.4994, "step": 1844 }, { "epoch": 3.242530755711775, "grad_norm": 0.21303691896998503, "learning_rate": 2.6524405864311203e-05, "loss": 0.486, "step": 1845 }, { "epoch": 3.244288224956063, "grad_norm": 0.18724797392391804, "learning_rate": 2.6478198086690357e-05, "loss": 0.4922, "step": 1846 }, { "epoch": 3.2460456942003515, "grad_norm": 0.23403604348477686, "learning_rate": 2.6432010672658822e-05, "loss": 0.492, "step": 1847 }, { "epoch": 3.24780316344464, "grad_norm": 0.21162410811826646, "learning_rate": 2.6385843691773995e-05, "loss": 0.5037, "step": 1848 }, { "epoch": 3.249560632688928, "grad_norm": 0.27574771816977706, "learning_rate": 2.6339697213562556e-05, "loss": 0.5024, "step": 1849 }, { "epoch": 3.251318101933216, "grad_norm": 0.26258592492599586, "learning_rate": 2.6293571307520234e-05, "loss": 0.4878, "step": 1850 }, { "epoch": 3.2530755711775043, "grad_norm": 0.2764686101164582, "learning_rate": 2.6247466043111818e-05, "loss": 0.4953, "step": 1851 }, { "epoch": 3.2548330404217927, "grad_norm": 0.21063891593010198, "learning_rate": 2.6201381489771002e-05, "loss": 0.4854, "step": 1852 }, { "epoch": 3.256590509666081, "grad_norm": 0.24225963412232937, "learning_rate": 2.6155317716900274e-05, "loss": 0.5041, "step": 1853 }, { "epoch": 3.2583479789103693, "grad_norm": 0.1941385848305529, "learning_rate": 2.6109274793870886e-05, "loss": 0.4821, "step": 1854 }, { "epoch": 3.260105448154657, "grad_norm": 0.20687635728250428, "learning_rate": 2.6063252790022623e-05, "loss": 0.4958, "step": 1855 }, { "epoch": 3.2618629173989455, "grad_norm": 0.1971640333173726, "learning_rate": 2.601725177466379e-05, "loss": 0.4965, "step": 1856 }, { "epoch": 3.263620386643234, "grad_norm": 0.2001705686291728, "learning_rate": 2.5971271817071085e-05, "loss": 0.4795, "step": 1857 }, { "epoch": 3.2653778558875217, "grad_norm": 0.22247778814095956, "learning_rate": 2.592531298648948e-05, "loss": 0.4963, "step": 1858 }, { "epoch": 3.26713532513181, "grad_norm": 0.21182574766339313, "learning_rate": 2.5879375352132162e-05, "loss": 0.4865, "step": 1859 }, { "epoch": 3.2688927943760984, "grad_norm": 0.19918176913076283, "learning_rate": 2.5833458983180382e-05, "loss": 0.4853, "step": 1860 }, { "epoch": 3.2706502636203867, "grad_norm": 0.18084283455764882, "learning_rate": 2.578756394878337e-05, "loss": 0.4877, "step": 1861 }, { "epoch": 3.272407732864675, "grad_norm": 0.21037307839873642, "learning_rate": 2.5741690318058182e-05, "loss": 0.4911, "step": 1862 }, { "epoch": 3.2741652021089633, "grad_norm": 0.2128907937319824, "learning_rate": 2.5695838160089687e-05, "loss": 0.494, "step": 1863 }, { "epoch": 3.275922671353251, "grad_norm": 0.19334853621385145, "learning_rate": 2.565000754393043e-05, "loss": 0.4855, "step": 1864 }, { "epoch": 3.2776801405975395, "grad_norm": 0.24522837113173593, "learning_rate": 2.5604198538600474e-05, "loss": 0.4918, "step": 1865 }, { "epoch": 3.279437609841828, "grad_norm": 0.2538721777940304, "learning_rate": 2.5558411213087363e-05, "loss": 0.4959, "step": 1866 }, { "epoch": 3.281195079086116, "grad_norm": 0.16703431430606236, "learning_rate": 2.5512645636345977e-05, "loss": 0.5045, "step": 1867 }, { "epoch": 3.282952548330404, "grad_norm": 0.2658473643563442, "learning_rate": 2.5466901877298425e-05, "loss": 0.5074, "step": 1868 }, { "epoch": 3.2847100175746924, "grad_norm": 0.23514125175660183, "learning_rate": 2.5421180004834015e-05, "loss": 0.4932, "step": 1869 }, { "epoch": 3.2864674868189807, "grad_norm": 0.1741758385211123, "learning_rate": 2.537548008780905e-05, "loss": 0.4974, "step": 1870 }, { "epoch": 3.288224956063269, "grad_norm": 0.2512992257517276, "learning_rate": 2.532980219504677e-05, "loss": 0.4903, "step": 1871 }, { "epoch": 3.2899824253075574, "grad_norm": 0.22606300171825877, "learning_rate": 2.5284146395337273e-05, "loss": 0.4729, "step": 1872 }, { "epoch": 3.2917398945518452, "grad_norm": 0.21651014393317716, "learning_rate": 2.5238512757437345e-05, "loss": 0.5095, "step": 1873 }, { "epoch": 3.2934973637961336, "grad_norm": 0.30913324255964614, "learning_rate": 2.5192901350070437e-05, "loss": 0.4955, "step": 1874 }, { "epoch": 3.295254833040422, "grad_norm": 0.1986375288117601, "learning_rate": 2.5147312241926503e-05, "loss": 0.482, "step": 1875 }, { "epoch": 3.29701230228471, "grad_norm": 0.216319714480977, "learning_rate": 2.510174550166191e-05, "loss": 0.4884, "step": 1876 }, { "epoch": 3.298769771528998, "grad_norm": 0.26969089372940913, "learning_rate": 2.505620119789935e-05, "loss": 0.4915, "step": 1877 }, { "epoch": 3.3005272407732864, "grad_norm": 0.1930445002779088, "learning_rate": 2.5010679399227716e-05, "loss": 0.5069, "step": 1878 }, { "epoch": 3.3022847100175747, "grad_norm": 0.19073939928292732, "learning_rate": 2.4965180174202024e-05, "loss": 0.4913, "step": 1879 }, { "epoch": 3.304042179261863, "grad_norm": 0.16043621083305132, "learning_rate": 2.491970359134327e-05, "loss": 0.4978, "step": 1880 }, { "epoch": 3.3057996485061514, "grad_norm": 0.22878858277246591, "learning_rate": 2.4874249719138383e-05, "loss": 0.4867, "step": 1881 }, { "epoch": 3.3075571177504393, "grad_norm": 0.17630673391582502, "learning_rate": 2.4828818626040067e-05, "loss": 0.479, "step": 1882 }, { "epoch": 3.3093145869947276, "grad_norm": 0.2066371532824389, "learning_rate": 2.4783410380466713e-05, "loss": 0.4872, "step": 1883 }, { "epoch": 3.311072056239016, "grad_norm": 0.1627974909229151, "learning_rate": 2.473802505080233e-05, "loss": 0.4893, "step": 1884 }, { "epoch": 3.3128295254833042, "grad_norm": 0.22679338856222894, "learning_rate": 2.4692662705396412e-05, "loss": 0.4936, "step": 1885 }, { "epoch": 3.314586994727592, "grad_norm": 0.1825887299920752, "learning_rate": 2.4647323412563823e-05, "loss": 0.4978, "step": 1886 }, { "epoch": 3.3163444639718804, "grad_norm": 0.19143457333462183, "learning_rate": 2.460200724058472e-05, "loss": 0.4937, "step": 1887 }, { "epoch": 3.3181019332161688, "grad_norm": 0.19684993027122966, "learning_rate": 2.4556714257704428e-05, "loss": 0.4917, "step": 1888 }, { "epoch": 3.319859402460457, "grad_norm": 0.2134450189642843, "learning_rate": 2.451144453213338e-05, "loss": 0.4828, "step": 1889 }, { "epoch": 3.3216168717047454, "grad_norm": 0.18392067667411302, "learning_rate": 2.4466198132046955e-05, "loss": 0.4943, "step": 1890 }, { "epoch": 3.3233743409490333, "grad_norm": 0.2316016345162589, "learning_rate": 2.442097512558541e-05, "loss": 0.4959, "step": 1891 }, { "epoch": 3.3251318101933216, "grad_norm": 0.20383761588609958, "learning_rate": 2.4375775580853785e-05, "loss": 0.4888, "step": 1892 }, { "epoch": 3.32688927943761, "grad_norm": 0.1781620377290269, "learning_rate": 2.4330599565921765e-05, "loss": 0.4841, "step": 1893 }, { "epoch": 3.3286467486818982, "grad_norm": 0.20518790109550186, "learning_rate": 2.4285447148823626e-05, "loss": 0.4981, "step": 1894 }, { "epoch": 3.330404217926186, "grad_norm": 0.18911061780891533, "learning_rate": 2.4240318397558083e-05, "loss": 0.4992, "step": 1895 }, { "epoch": 3.3321616871704745, "grad_norm": 0.1619798874979453, "learning_rate": 2.4195213380088224e-05, "loss": 0.4812, "step": 1896 }, { "epoch": 3.3339191564147628, "grad_norm": 0.17794361288992977, "learning_rate": 2.4150132164341385e-05, "loss": 0.4994, "step": 1897 }, { "epoch": 3.335676625659051, "grad_norm": 0.18911197751998826, "learning_rate": 2.410507481820906e-05, "loss": 0.4832, "step": 1898 }, { "epoch": 3.3374340949033394, "grad_norm": 0.188672429755467, "learning_rate": 2.4060041409546812e-05, "loss": 0.495, "step": 1899 }, { "epoch": 3.3391915641476273, "grad_norm": 0.20353488139826223, "learning_rate": 2.4015032006174135e-05, "loss": 0.4986, "step": 1900 }, { "epoch": 3.3409490333919156, "grad_norm": 0.20335602604714714, "learning_rate": 2.397004667587437e-05, "loss": 0.4885, "step": 1901 }, { "epoch": 3.342706502636204, "grad_norm": 0.17799645922683938, "learning_rate": 2.3925085486394614e-05, "loss": 0.4857, "step": 1902 }, { "epoch": 3.3444639718804923, "grad_norm": 0.20852884757296342, "learning_rate": 2.3880148505445596e-05, "loss": 0.4842, "step": 1903 }, { "epoch": 3.34622144112478, "grad_norm": 0.1733770150388712, "learning_rate": 2.383523580070163e-05, "loss": 0.4964, "step": 1904 }, { "epoch": 3.3479789103690685, "grad_norm": 0.17039439666605058, "learning_rate": 2.379034743980041e-05, "loss": 0.4946, "step": 1905 }, { "epoch": 3.349736379613357, "grad_norm": 0.16184567258864047, "learning_rate": 2.3745483490343e-05, "loss": 0.486, "step": 1906 }, { "epoch": 3.351493848857645, "grad_norm": 0.16295798374923942, "learning_rate": 2.370064401989371e-05, "loss": 0.5094, "step": 1907 }, { "epoch": 3.3532513181019334, "grad_norm": 0.16845332892813988, "learning_rate": 2.3655829095979933e-05, "loss": 0.4842, "step": 1908 }, { "epoch": 3.3550087873462213, "grad_norm": 0.16420925147568258, "learning_rate": 2.3611038786092185e-05, "loss": 0.4926, "step": 1909 }, { "epoch": 3.3567662565905096, "grad_norm": 0.17065349201573113, "learning_rate": 2.356627315768385e-05, "loss": 0.502, "step": 1910 }, { "epoch": 3.358523725834798, "grad_norm": 0.1635212559461496, "learning_rate": 2.3521532278171142e-05, "loss": 0.4956, "step": 1911 }, { "epoch": 3.3602811950790863, "grad_norm": 0.1629732502521611, "learning_rate": 2.3476816214933017e-05, "loss": 0.4955, "step": 1912 }, { "epoch": 3.362038664323374, "grad_norm": 0.1703924748428273, "learning_rate": 2.3432125035311046e-05, "loss": 0.5065, "step": 1913 }, { "epoch": 3.3637961335676625, "grad_norm": 0.17176118944637206, "learning_rate": 2.3387458806609375e-05, "loss": 0.4857, "step": 1914 }, { "epoch": 3.365553602811951, "grad_norm": 0.18115984683500852, "learning_rate": 2.3342817596094523e-05, "loss": 0.4832, "step": 1915 }, { "epoch": 3.367311072056239, "grad_norm": 0.17568362365374735, "learning_rate": 2.3298201470995333e-05, "loss": 0.4902, "step": 1916 }, { "epoch": 3.3690685413005275, "grad_norm": 0.15546362773331876, "learning_rate": 2.325361049850289e-05, "loss": 0.4897, "step": 1917 }, { "epoch": 3.3708260105448153, "grad_norm": 0.1942745315672467, "learning_rate": 2.320904474577038e-05, "loss": 0.4916, "step": 1918 }, { "epoch": 3.3725834797891037, "grad_norm": 0.1527824924883889, "learning_rate": 2.3164504279913045e-05, "loss": 0.4828, "step": 1919 }, { "epoch": 3.374340949033392, "grad_norm": 0.15150550750664074, "learning_rate": 2.3119989168008018e-05, "loss": 0.4892, "step": 1920 }, { "epoch": 3.3760984182776803, "grad_norm": 0.16282708718997108, "learning_rate": 2.307549947709424e-05, "loss": 0.4949, "step": 1921 }, { "epoch": 3.377855887521968, "grad_norm": 0.16336886597895492, "learning_rate": 2.303103527417239e-05, "loss": 0.4998, "step": 1922 }, { "epoch": 3.3796133567662565, "grad_norm": 0.17234905285854024, "learning_rate": 2.298659662620475e-05, "loss": 0.4938, "step": 1923 }, { "epoch": 3.381370826010545, "grad_norm": 0.17972111033253652, "learning_rate": 2.2942183600115127e-05, "loss": 0.4795, "step": 1924 }, { "epoch": 3.383128295254833, "grad_norm": 0.1714584260687188, "learning_rate": 2.2897796262788728e-05, "loss": 0.4948, "step": 1925 }, { "epoch": 3.3848857644991215, "grad_norm": 0.20461649465763945, "learning_rate": 2.2853434681072085e-05, "loss": 0.4971, "step": 1926 }, { "epoch": 3.3866432337434094, "grad_norm": 0.16915606519799997, "learning_rate": 2.2809098921772935e-05, "loss": 0.4907, "step": 1927 }, { "epoch": 3.3884007029876977, "grad_norm": 0.23343472417781597, "learning_rate": 2.2764789051660115e-05, "loss": 0.4979, "step": 1928 }, { "epoch": 3.390158172231986, "grad_norm": 0.19684379221689763, "learning_rate": 2.2720505137463528e-05, "loss": 0.5039, "step": 1929 }, { "epoch": 3.3919156414762743, "grad_norm": 0.17973711613235613, "learning_rate": 2.2676247245873925e-05, "loss": 0.4919, "step": 1930 }, { "epoch": 3.393673110720562, "grad_norm": 0.1902595445216233, "learning_rate": 2.2632015443542894e-05, "loss": 0.4909, "step": 1931 }, { "epoch": 3.3954305799648505, "grad_norm": 0.17723128891694137, "learning_rate": 2.2587809797082722e-05, "loss": 0.4978, "step": 1932 }, { "epoch": 3.397188049209139, "grad_norm": 0.1633555240446606, "learning_rate": 2.254363037306631e-05, "loss": 0.4921, "step": 1933 }, { "epoch": 3.398945518453427, "grad_norm": 0.18508425782979235, "learning_rate": 2.2499477238027098e-05, "loss": 0.4887, "step": 1934 }, { "epoch": 3.4007029876977155, "grad_norm": 0.14845495904037334, "learning_rate": 2.2455350458458905e-05, "loss": 0.5051, "step": 1935 }, { "epoch": 3.4024604569420034, "grad_norm": 0.18382012984526788, "learning_rate": 2.241125010081584e-05, "loss": 0.4883, "step": 1936 }, { "epoch": 3.4042179261862917, "grad_norm": 0.1856901488387322, "learning_rate": 2.2367176231512254e-05, "loss": 0.4891, "step": 1937 }, { "epoch": 3.40597539543058, "grad_norm": 0.23657324734757249, "learning_rate": 2.232312891692259e-05, "loss": 0.4857, "step": 1938 }, { "epoch": 3.4077328646748684, "grad_norm": 0.19721754761487725, "learning_rate": 2.2279108223381335e-05, "loss": 0.4841, "step": 1939 }, { "epoch": 3.4094903339191562, "grad_norm": 0.24143447294118936, "learning_rate": 2.2235114217182837e-05, "loss": 0.4942, "step": 1940 }, { "epoch": 3.4112478031634446, "grad_norm": 0.2616275401550436, "learning_rate": 2.219114696458128e-05, "loss": 0.4999, "step": 1941 }, { "epoch": 3.413005272407733, "grad_norm": 0.19093612267996984, "learning_rate": 2.2147206531790547e-05, "loss": 0.5094, "step": 1942 }, { "epoch": 3.414762741652021, "grad_norm": 0.25177139642021784, "learning_rate": 2.2103292984984125e-05, "loss": 0.4952, "step": 1943 }, { "epoch": 3.4165202108963095, "grad_norm": 0.1811656061472481, "learning_rate": 2.2059406390295056e-05, "loss": 0.4972, "step": 1944 }, { "epoch": 3.4182776801405974, "grad_norm": 0.24307567640487326, "learning_rate": 2.2015546813815732e-05, "loss": 0.4926, "step": 1945 }, { "epoch": 3.4200351493848857, "grad_norm": 0.19955270389328753, "learning_rate": 2.1971714321597895e-05, "loss": 0.4923, "step": 1946 }, { "epoch": 3.421792618629174, "grad_norm": 0.1623698081670924, "learning_rate": 2.1927908979652495e-05, "loss": 0.5005, "step": 1947 }, { "epoch": 3.4235500878734624, "grad_norm": 0.1670083575129648, "learning_rate": 2.1884130853949532e-05, "loss": 0.4974, "step": 1948 }, { "epoch": 3.4253075571177503, "grad_norm": 0.2006857419510839, "learning_rate": 2.1840380010418133e-05, "loss": 0.5183, "step": 1949 }, { "epoch": 3.4270650263620386, "grad_norm": 0.1534943655747408, "learning_rate": 2.1796656514946254e-05, "loss": 0.5108, "step": 1950 }, { "epoch": 3.428822495606327, "grad_norm": 0.15687524703912376, "learning_rate": 2.1752960433380688e-05, "loss": 0.4941, "step": 1951 }, { "epoch": 3.4305799648506152, "grad_norm": 0.14977145938821718, "learning_rate": 2.170929183152695e-05, "loss": 0.5055, "step": 1952 }, { "epoch": 3.4323374340949035, "grad_norm": 0.1788117715763678, "learning_rate": 2.1665650775149142e-05, "loss": 0.4872, "step": 1953 }, { "epoch": 3.4340949033391914, "grad_norm": 0.1630030044638992, "learning_rate": 2.1622037329969947e-05, "loss": 0.4895, "step": 1954 }, { "epoch": 3.4358523725834798, "grad_norm": 0.15894144395640483, "learning_rate": 2.15784515616704e-05, "loss": 0.4901, "step": 1955 }, { "epoch": 3.437609841827768, "grad_norm": 0.15180477198744935, "learning_rate": 2.153489353588988e-05, "loss": 0.5019, "step": 1956 }, { "epoch": 3.4393673110720564, "grad_norm": 0.16412852129691646, "learning_rate": 2.1491363318226e-05, "loss": 0.4917, "step": 1957 }, { "epoch": 3.4411247803163443, "grad_norm": 0.14573785010309692, "learning_rate": 2.1447860974234466e-05, "loss": 0.4994, "step": 1958 }, { "epoch": 3.4428822495606326, "grad_norm": 0.149674114938567, "learning_rate": 2.1404386569429035e-05, "loss": 0.4918, "step": 1959 }, { "epoch": 3.444639718804921, "grad_norm": 0.14665639295793245, "learning_rate": 2.1360940169281364e-05, "loss": 0.4905, "step": 1960 }, { "epoch": 3.4463971880492092, "grad_norm": 0.13168452150502152, "learning_rate": 2.1317521839220947e-05, "loss": 0.4747, "step": 1961 }, { "epoch": 3.4481546572934976, "grad_norm": 0.1585526243776926, "learning_rate": 2.127413164463501e-05, "loss": 0.4931, "step": 1962 }, { "epoch": 3.4499121265377855, "grad_norm": 0.13394615598992285, "learning_rate": 2.1230769650868382e-05, "loss": 0.481, "step": 1963 }, { "epoch": 3.4516695957820738, "grad_norm": 0.16530976147729265, "learning_rate": 2.1187435923223482e-05, "loss": 0.4956, "step": 1964 }, { "epoch": 3.453427065026362, "grad_norm": 0.16717879638236544, "learning_rate": 2.1144130526960093e-05, "loss": 0.4921, "step": 1965 }, { "epoch": 3.4551845342706504, "grad_norm": 0.15711873883606217, "learning_rate": 2.1100853527295372e-05, "loss": 0.4846, "step": 1966 }, { "epoch": 3.4569420035149383, "grad_norm": 0.18353958858437608, "learning_rate": 2.1057604989403682e-05, "loss": 0.4822, "step": 1967 }, { "epoch": 3.4586994727592266, "grad_norm": 0.17369752881083905, "learning_rate": 2.1014384978416552e-05, "loss": 0.501, "step": 1968 }, { "epoch": 3.460456942003515, "grad_norm": 0.20597636127205704, "learning_rate": 2.0971193559422535e-05, "loss": 0.4988, "step": 1969 }, { "epoch": 3.4622144112478033, "grad_norm": 0.16960650158446355, "learning_rate": 2.0928030797467126e-05, "loss": 0.5097, "step": 1970 }, { "epoch": 3.4639718804920916, "grad_norm": 0.24890111957876015, "learning_rate": 2.088489675755266e-05, "loss": 0.4979, "step": 1971 }, { "epoch": 3.4657293497363795, "grad_norm": 0.15624192751095473, "learning_rate": 2.0841791504638235e-05, "loss": 0.5, "step": 1972 }, { "epoch": 3.467486818980668, "grad_norm": 0.23672834740261367, "learning_rate": 2.0798715103639555e-05, "loss": 0.5007, "step": 1973 }, { "epoch": 3.469244288224956, "grad_norm": 0.13725064898597733, "learning_rate": 2.075566761942893e-05, "loss": 0.4959, "step": 1974 }, { "epoch": 3.4710017574692444, "grad_norm": 0.1847438337401723, "learning_rate": 2.0712649116835096e-05, "loss": 0.4865, "step": 1975 }, { "epoch": 3.4727592267135323, "grad_norm": 0.14738272585245832, "learning_rate": 2.066965966064312e-05, "loss": 0.4976, "step": 1976 }, { "epoch": 3.4745166959578206, "grad_norm": 0.1707607625576542, "learning_rate": 2.0626699315594357e-05, "loss": 0.497, "step": 1977 }, { "epoch": 3.476274165202109, "grad_norm": 0.15235348748647173, "learning_rate": 2.0583768146386304e-05, "loss": 0.4832, "step": 1978 }, { "epoch": 3.4780316344463973, "grad_norm": 0.17271540650783102, "learning_rate": 2.0540866217672538e-05, "loss": 0.4832, "step": 1979 }, { "epoch": 3.4797891036906856, "grad_norm": 0.17205478106966066, "learning_rate": 2.049799359406257e-05, "loss": 0.4962, "step": 1980 }, { "epoch": 3.4815465729349735, "grad_norm": 0.1564146116949818, "learning_rate": 2.0455150340121818e-05, "loss": 0.4896, "step": 1981 }, { "epoch": 3.483304042179262, "grad_norm": 0.1536730000558837, "learning_rate": 2.0412336520371428e-05, "loss": 0.493, "step": 1982 }, { "epoch": 3.48506151142355, "grad_norm": 0.17787970323328411, "learning_rate": 2.0369552199288236e-05, "loss": 0.4962, "step": 1983 }, { "epoch": 3.4868189806678385, "grad_norm": 0.14938583488825186, "learning_rate": 2.032679744130468e-05, "loss": 0.4956, "step": 1984 }, { "epoch": 3.4885764499121263, "grad_norm": 0.1706420107025884, "learning_rate": 2.0284072310808645e-05, "loss": 0.5099, "step": 1985 }, { "epoch": 3.4903339191564147, "grad_norm": 0.14749286266477302, "learning_rate": 2.0241376872143395e-05, "loss": 0.4997, "step": 1986 }, { "epoch": 3.492091388400703, "grad_norm": 0.1469256428442001, "learning_rate": 2.01987111896075e-05, "loss": 0.4899, "step": 1987 }, { "epoch": 3.4938488576449913, "grad_norm": 0.15021444023267916, "learning_rate": 2.0156075327454684e-05, "loss": 0.4818, "step": 1988 }, { "epoch": 3.4956063268892796, "grad_norm": 0.16206484435202423, "learning_rate": 2.0113469349893818e-05, "loss": 0.4894, "step": 1989 }, { "epoch": 3.4973637961335675, "grad_norm": 0.15828547960739262, "learning_rate": 2.0070893321088735e-05, "loss": 0.5023, "step": 1990 }, { "epoch": 3.499121265377856, "grad_norm": 0.16077168137330464, "learning_rate": 2.002834730515814e-05, "loss": 0.4909, "step": 1991 }, { "epoch": 3.500878734622144, "grad_norm": 0.1782666614165153, "learning_rate": 1.9985831366175564e-05, "loss": 0.5009, "step": 1992 }, { "epoch": 3.5026362038664325, "grad_norm": 0.1488162246332347, "learning_rate": 1.9943345568169243e-05, "loss": 0.4879, "step": 1993 }, { "epoch": 3.5043936731107204, "grad_norm": 0.18589340406676866, "learning_rate": 1.9900889975122043e-05, "loss": 0.4971, "step": 1994 }, { "epoch": 3.5061511423550087, "grad_norm": 0.15158884734965036, "learning_rate": 1.9858464650971303e-05, "loss": 0.5031, "step": 1995 }, { "epoch": 3.507908611599297, "grad_norm": 0.1976443126190284, "learning_rate": 1.98160696596088e-05, "loss": 0.4962, "step": 1996 }, { "epoch": 3.5096660808435853, "grad_norm": 0.15936332614713652, "learning_rate": 1.9773705064880624e-05, "loss": 0.488, "step": 1997 }, { "epoch": 3.5114235500878737, "grad_norm": 0.21136552742910472, "learning_rate": 1.9731370930587076e-05, "loss": 0.4996, "step": 1998 }, { "epoch": 3.5131810193321615, "grad_norm": 0.14634959301337683, "learning_rate": 1.9689067320482617e-05, "loss": 0.4878, "step": 1999 }, { "epoch": 3.51493848857645, "grad_norm": 0.18650678949958388, "learning_rate": 1.9646794298275717e-05, "loss": 0.4954, "step": 2000 }, { "epoch": 3.516695957820738, "grad_norm": 0.16663077601066634, "learning_rate": 1.9604551927628774e-05, "loss": 0.4935, "step": 2001 }, { "epoch": 3.5184534270650265, "grad_norm": 0.1827019736748605, "learning_rate": 1.956234027215806e-05, "loss": 0.4799, "step": 2002 }, { "epoch": 3.5202108963093144, "grad_norm": 0.18903690104534435, "learning_rate": 1.9520159395433507e-05, "loss": 0.4922, "step": 2003 }, { "epoch": 3.5219683655536027, "grad_norm": 0.16358599098112905, "learning_rate": 1.947800936097881e-05, "loss": 0.5071, "step": 2004 }, { "epoch": 3.523725834797891, "grad_norm": 0.16133471197309793, "learning_rate": 1.9435890232271136e-05, "loss": 0.4902, "step": 2005 }, { "epoch": 3.5254833040421794, "grad_norm": 0.14899854457180772, "learning_rate": 1.939380207274114e-05, "loss": 0.4882, "step": 2006 }, { "epoch": 3.5272407732864677, "grad_norm": 0.17080379171383858, "learning_rate": 1.9351744945772818e-05, "loss": 0.4938, "step": 2007 }, { "epoch": 3.5289982425307556, "grad_norm": 0.1653513397575772, "learning_rate": 1.9309718914703438e-05, "loss": 0.4948, "step": 2008 }, { "epoch": 3.530755711775044, "grad_norm": 0.18659227678181922, "learning_rate": 1.926772404282347e-05, "loss": 0.4843, "step": 2009 }, { "epoch": 3.532513181019332, "grad_norm": 0.15321812703232093, "learning_rate": 1.9225760393376414e-05, "loss": 0.4858, "step": 2010 }, { "epoch": 3.5342706502636205, "grad_norm": 0.17834186340215688, "learning_rate": 1.9183828029558774e-05, "loss": 0.4825, "step": 2011 }, { "epoch": 3.5360281195079084, "grad_norm": 0.1617239893733656, "learning_rate": 1.914192701451993e-05, "loss": 0.484, "step": 2012 }, { "epoch": 3.5377855887521967, "grad_norm": 0.18431111025505767, "learning_rate": 1.9100057411362048e-05, "loss": 0.4887, "step": 2013 }, { "epoch": 3.539543057996485, "grad_norm": 0.1687601995586054, "learning_rate": 1.905821928314e-05, "loss": 0.4855, "step": 2014 }, { "epoch": 3.5413005272407734, "grad_norm": 0.15348896970694648, "learning_rate": 1.901641269286125e-05, "loss": 0.4824, "step": 2015 }, { "epoch": 3.5430579964850617, "grad_norm": 0.17618908926092822, "learning_rate": 1.8974637703485766e-05, "loss": 0.493, "step": 2016 }, { "epoch": 3.5448154657293496, "grad_norm": 0.13926575130557142, "learning_rate": 1.8932894377925928e-05, "loss": 0.4993, "step": 2017 }, { "epoch": 3.546572934973638, "grad_norm": 0.17645044158528927, "learning_rate": 1.8891182779046414e-05, "loss": 0.4856, "step": 2018 }, { "epoch": 3.5483304042179262, "grad_norm": 0.15712676395965597, "learning_rate": 1.884950296966417e-05, "loss": 0.478, "step": 2019 }, { "epoch": 3.5500878734622145, "grad_norm": 0.16323143009036825, "learning_rate": 1.8807855012548214e-05, "loss": 0.5044, "step": 2020 }, { "epoch": 3.5518453427065024, "grad_norm": 0.17317786956182335, "learning_rate": 1.8766238970419624e-05, "loss": 0.4921, "step": 2021 }, { "epoch": 3.5536028119507908, "grad_norm": 0.1487017616494527, "learning_rate": 1.872465490595141e-05, "loss": 0.4939, "step": 2022 }, { "epoch": 3.555360281195079, "grad_norm": 0.18483853338860268, "learning_rate": 1.868310288176841e-05, "loss": 0.4962, "step": 2023 }, { "epoch": 3.5571177504393674, "grad_norm": 0.1356115225782941, "learning_rate": 1.864158296044723e-05, "loss": 0.4918, "step": 2024 }, { "epoch": 3.5588752196836557, "grad_norm": 0.20491752785438302, "learning_rate": 1.860009520451612e-05, "loss": 0.4933, "step": 2025 }, { "epoch": 3.5606326889279436, "grad_norm": 0.14419898205696466, "learning_rate": 1.855863967645489e-05, "loss": 0.5096, "step": 2026 }, { "epoch": 3.562390158172232, "grad_norm": 0.20350554118374536, "learning_rate": 1.851721643869482e-05, "loss": 0.4977, "step": 2027 }, { "epoch": 3.5641476274165202, "grad_norm": 0.16081288884933972, "learning_rate": 1.847582555361853e-05, "loss": 0.4972, "step": 2028 }, { "epoch": 3.5659050966608086, "grad_norm": 0.187736767999281, "learning_rate": 1.843446708355999e-05, "loss": 0.4855, "step": 2029 }, { "epoch": 3.5676625659050965, "grad_norm": 0.17175487824225424, "learning_rate": 1.8393141090804287e-05, "loss": 0.4829, "step": 2030 }, { "epoch": 3.5694200351493848, "grad_norm": 0.18021635608074782, "learning_rate": 1.8351847637587617e-05, "loss": 0.4896, "step": 2031 }, { "epoch": 3.571177504393673, "grad_norm": 0.17656418409507038, "learning_rate": 1.831058678609718e-05, "loss": 0.491, "step": 2032 }, { "epoch": 3.5729349736379614, "grad_norm": 0.16357118675482982, "learning_rate": 1.826935859847105e-05, "loss": 0.4969, "step": 2033 }, { "epoch": 3.5746924428822497, "grad_norm": 0.1717318215844995, "learning_rate": 1.8228163136798184e-05, "loss": 0.4875, "step": 2034 }, { "epoch": 3.5764499121265376, "grad_norm": 0.17849852650561604, "learning_rate": 1.818700046311815e-05, "loss": 0.4935, "step": 2035 }, { "epoch": 3.578207381370826, "grad_norm": 0.16373298522287033, "learning_rate": 1.8145870639421218e-05, "loss": 0.4908, "step": 2036 }, { "epoch": 3.5799648506151143, "grad_norm": 0.18204244073829276, "learning_rate": 1.8104773727648152e-05, "loss": 0.4855, "step": 2037 }, { "epoch": 3.5817223198594026, "grad_norm": 0.16648802775692398, "learning_rate": 1.8063709789690144e-05, "loss": 0.5028, "step": 2038 }, { "epoch": 3.5834797891036905, "grad_norm": 0.175822573802549, "learning_rate": 1.802267888738877e-05, "loss": 0.4835, "step": 2039 }, { "epoch": 3.585237258347979, "grad_norm": 0.13999425768109072, "learning_rate": 1.7981681082535814e-05, "loss": 0.4956, "step": 2040 }, { "epoch": 3.586994727592267, "grad_norm": 0.16743947816176313, "learning_rate": 1.7940716436873227e-05, "loss": 0.4922, "step": 2041 }, { "epoch": 3.5887521968365554, "grad_norm": 0.13938856230903027, "learning_rate": 1.7899785012093032e-05, "loss": 0.4755, "step": 2042 }, { "epoch": 3.5905096660808438, "grad_norm": 0.1365560319293613, "learning_rate": 1.7858886869837193e-05, "loss": 0.5058, "step": 2043 }, { "epoch": 3.5922671353251316, "grad_norm": 0.14165031653284013, "learning_rate": 1.781802207169761e-05, "loss": 0.4958, "step": 2044 }, { "epoch": 3.59402460456942, "grad_norm": 0.15249359853744893, "learning_rate": 1.7777190679215923e-05, "loss": 0.4986, "step": 2045 }, { "epoch": 3.5957820738137083, "grad_norm": 0.15871682519470595, "learning_rate": 1.7736392753883443e-05, "loss": 0.4914, "step": 2046 }, { "epoch": 3.5975395430579966, "grad_norm": 0.16418271025832545, "learning_rate": 1.769562835714113e-05, "loss": 0.4854, "step": 2047 }, { "epoch": 3.5992970123022845, "grad_norm": 0.18562885396450915, "learning_rate": 1.7654897550379405e-05, "loss": 0.4838, "step": 2048 }, { "epoch": 3.601054481546573, "grad_norm": 0.17507411051083407, "learning_rate": 1.7614200394938163e-05, "loss": 0.4866, "step": 2049 }, { "epoch": 3.602811950790861, "grad_norm": 0.35857174761953703, "learning_rate": 1.7573536952106563e-05, "loss": 0.4985, "step": 2050 }, { "epoch": 3.6045694200351495, "grad_norm": 0.1609869481959847, "learning_rate": 1.7532907283123024e-05, "loss": 0.4986, "step": 2051 }, { "epoch": 3.606326889279438, "grad_norm": 0.1608906019624701, "learning_rate": 1.7492311449175082e-05, "loss": 0.489, "step": 2052 }, { "epoch": 3.608084358523726, "grad_norm": 0.14599940546701679, "learning_rate": 1.745174951139933e-05, "loss": 0.4864, "step": 2053 }, { "epoch": 3.609841827768014, "grad_norm": 0.14526929938074967, "learning_rate": 1.7411221530881323e-05, "loss": 0.4861, "step": 2054 }, { "epoch": 3.6115992970123023, "grad_norm": 0.15039767763024073, "learning_rate": 1.737072756865547e-05, "loss": 0.5212, "step": 2055 }, { "epoch": 3.6133567662565906, "grad_norm": 0.1304964500490624, "learning_rate": 1.7330267685704937e-05, "loss": 0.4853, "step": 2056 }, { "epoch": 3.6151142355008785, "grad_norm": 0.16390233277528923, "learning_rate": 1.7289841942961594e-05, "loss": 0.4842, "step": 2057 }, { "epoch": 3.616871704745167, "grad_norm": 0.1439713208580424, "learning_rate": 1.7249450401305835e-05, "loss": 0.4852, "step": 2058 }, { "epoch": 3.618629173989455, "grad_norm": 0.18015510575394816, "learning_rate": 1.7209093121566635e-05, "loss": 0.4984, "step": 2059 }, { "epoch": 3.6203866432337435, "grad_norm": 0.1524302903185474, "learning_rate": 1.7168770164521315e-05, "loss": 0.4886, "step": 2060 }, { "epoch": 3.622144112478032, "grad_norm": 0.15981742114177366, "learning_rate": 1.7128481590895515e-05, "loss": 0.4984, "step": 2061 }, { "epoch": 3.62390158172232, "grad_norm": 0.1399429857467507, "learning_rate": 1.7088227461363106e-05, "loss": 0.4954, "step": 2062 }, { "epoch": 3.625659050966608, "grad_norm": 0.16114816002293494, "learning_rate": 1.704800783654606e-05, "loss": 0.4866, "step": 2063 }, { "epoch": 3.6274165202108963, "grad_norm": 0.14369773698391544, "learning_rate": 1.7007822777014433e-05, "loss": 0.4744, "step": 2064 }, { "epoch": 3.6291739894551847, "grad_norm": 0.1545282199612366, "learning_rate": 1.6967672343286187e-05, "loss": 0.4765, "step": 2065 }, { "epoch": 3.6309314586994725, "grad_norm": 0.15392714032497634, "learning_rate": 1.692755659582715e-05, "loss": 0.4751, "step": 2066 }, { "epoch": 3.632688927943761, "grad_norm": 0.14085877498034438, "learning_rate": 1.688747559505091e-05, "loss": 0.4979, "step": 2067 }, { "epoch": 3.634446397188049, "grad_norm": 0.14236806232653648, "learning_rate": 1.6847429401318732e-05, "loss": 0.4938, "step": 2068 }, { "epoch": 3.6362038664323375, "grad_norm": 0.1361613302612382, "learning_rate": 1.6807418074939458e-05, "loss": 0.4887, "step": 2069 }, { "epoch": 3.637961335676626, "grad_norm": 0.16686744056544528, "learning_rate": 1.6767441676169424e-05, "loss": 0.4788, "step": 2070 }, { "epoch": 3.639718804920914, "grad_norm": 0.14800752522974842, "learning_rate": 1.6727500265212367e-05, "loss": 0.4964, "step": 2071 }, { "epoch": 3.641476274165202, "grad_norm": 0.1674102354188738, "learning_rate": 1.6687593902219322e-05, "loss": 0.4952, "step": 2072 }, { "epoch": 3.6432337434094904, "grad_norm": 0.14640510176143612, "learning_rate": 1.6647722647288548e-05, "loss": 0.4882, "step": 2073 }, { "epoch": 3.6449912126537787, "grad_norm": 0.17159049284814287, "learning_rate": 1.660788656046545e-05, "loss": 0.4898, "step": 2074 }, { "epoch": 3.6467486818980666, "grad_norm": 0.1513847933987227, "learning_rate": 1.6568085701742447e-05, "loss": 0.5007, "step": 2075 }, { "epoch": 3.648506151142355, "grad_norm": 0.18182936760849028, "learning_rate": 1.652832013105891e-05, "loss": 0.4858, "step": 2076 }, { "epoch": 3.650263620386643, "grad_norm": 0.17446329305673433, "learning_rate": 1.648858990830108e-05, "loss": 0.4908, "step": 2077 }, { "epoch": 3.6520210896309315, "grad_norm": 0.1553038893143316, "learning_rate": 1.6448895093301944e-05, "loss": 0.4849, "step": 2078 }, { "epoch": 3.65377855887522, "grad_norm": 0.17209025877695902, "learning_rate": 1.6409235745841178e-05, "loss": 0.4952, "step": 2079 }, { "epoch": 3.655536028119508, "grad_norm": 0.28598677093304053, "learning_rate": 1.636961192564504e-05, "loss": 0.473, "step": 2080 }, { "epoch": 3.657293497363796, "grad_norm": 0.18530852455447658, "learning_rate": 1.6330023692386294e-05, "loss": 0.4989, "step": 2081 }, { "epoch": 3.6590509666080844, "grad_norm": 0.1661119311821805, "learning_rate": 1.62904711056841e-05, "loss": 0.4936, "step": 2082 }, { "epoch": 3.6608084358523727, "grad_norm": 0.17540156134163648, "learning_rate": 1.625095422510392e-05, "loss": 0.493, "step": 2083 }, { "epoch": 3.6625659050966606, "grad_norm": 0.1637413566940118, "learning_rate": 1.6211473110157504e-05, "loss": 0.4797, "step": 2084 }, { "epoch": 3.664323374340949, "grad_norm": 0.1643753094704167, "learning_rate": 1.617202782030267e-05, "loss": 0.5004, "step": 2085 }, { "epoch": 3.6660808435852372, "grad_norm": 0.13758168099545828, "learning_rate": 1.613261841494332e-05, "loss": 0.4811, "step": 2086 }, { "epoch": 3.6678383128295255, "grad_norm": 0.1730908486568683, "learning_rate": 1.6093244953429295e-05, "loss": 0.4881, "step": 2087 }, { "epoch": 3.669595782073814, "grad_norm": 0.1614900530053359, "learning_rate": 1.6053907495056312e-05, "loss": 0.481, "step": 2088 }, { "epoch": 3.671353251318102, "grad_norm": 0.15228845661516213, "learning_rate": 1.6014606099065915e-05, "loss": 0.4901, "step": 2089 }, { "epoch": 3.67311072056239, "grad_norm": 0.14883761215454186, "learning_rate": 1.5975340824645252e-05, "loss": 0.486, "step": 2090 }, { "epoch": 3.6748681898066784, "grad_norm": 0.15969116646061765, "learning_rate": 1.5936111730927127e-05, "loss": 0.4902, "step": 2091 }, { "epoch": 3.6766256590509667, "grad_norm": 0.14344502969340883, "learning_rate": 1.5896918876989856e-05, "loss": 0.487, "step": 2092 }, { "epoch": 3.6783831282952546, "grad_norm": 0.18329646672574995, "learning_rate": 1.5857762321857143e-05, "loss": 0.4832, "step": 2093 }, { "epoch": 3.680140597539543, "grad_norm": 0.18932464321968495, "learning_rate": 1.5818642124498088e-05, "loss": 0.5017, "step": 2094 }, { "epoch": 3.6818980667838312, "grad_norm": 0.1962520686543727, "learning_rate": 1.5779558343826983e-05, "loss": 0.4803, "step": 2095 }, { "epoch": 3.6836555360281196, "grad_norm": 0.15385566763292186, "learning_rate": 1.5740511038703296e-05, "loss": 0.4942, "step": 2096 }, { "epoch": 3.685413005272408, "grad_norm": 0.1943210931152153, "learning_rate": 1.5701500267931567e-05, "loss": 0.5024, "step": 2097 }, { "epoch": 3.687170474516696, "grad_norm": 0.16911637690821507, "learning_rate": 1.5662526090261294e-05, "loss": 0.4839, "step": 2098 }, { "epoch": 3.688927943760984, "grad_norm": 0.17257552190634817, "learning_rate": 1.5623588564386914e-05, "loss": 0.492, "step": 2099 }, { "epoch": 3.6906854130052724, "grad_norm": 0.17524671841423042, "learning_rate": 1.5584687748947617e-05, "loss": 0.5004, "step": 2100 }, { "epoch": 3.6924428822495607, "grad_norm": 0.15933166526284, "learning_rate": 1.554582370252735e-05, "loss": 0.4933, "step": 2101 }, { "epoch": 3.6942003514938486, "grad_norm": 0.15457210858935272, "learning_rate": 1.550699648365464e-05, "loss": 0.4868, "step": 2102 }, { "epoch": 3.695957820738137, "grad_norm": 0.13407681389175333, "learning_rate": 1.5468206150802566e-05, "loss": 0.5052, "step": 2103 }, { "epoch": 3.6977152899824253, "grad_norm": 0.16124621237087058, "learning_rate": 1.5429452762388698e-05, "loss": 0.4995, "step": 2104 }, { "epoch": 3.6994727592267136, "grad_norm": 0.14417281429260362, "learning_rate": 1.5390736376774932e-05, "loss": 0.4891, "step": 2105 }, { "epoch": 3.701230228471002, "grad_norm": 0.1570490815280817, "learning_rate": 1.5352057052267436e-05, "loss": 0.4918, "step": 2106 }, { "epoch": 3.7029876977152902, "grad_norm": 0.14810899263678254, "learning_rate": 1.5313414847116583e-05, "loss": 0.4891, "step": 2107 }, { "epoch": 3.704745166959578, "grad_norm": 0.18271451777456302, "learning_rate": 1.5274809819516817e-05, "loss": 0.4915, "step": 2108 }, { "epoch": 3.7065026362038664, "grad_norm": 0.1439823219341696, "learning_rate": 1.5236242027606638e-05, "loss": 0.493, "step": 2109 }, { "epoch": 3.7082601054481548, "grad_norm": 0.16447206895863545, "learning_rate": 1.5197711529468434e-05, "loss": 0.4913, "step": 2110 }, { "epoch": 3.7100175746924426, "grad_norm": 0.1797440836460246, "learning_rate": 1.5159218383128433e-05, "loss": 0.502, "step": 2111 }, { "epoch": 3.711775043936731, "grad_norm": 0.18566436432347905, "learning_rate": 1.5120762646556619e-05, "loss": 0.4905, "step": 2112 }, { "epoch": 3.7135325131810193, "grad_norm": 0.1750365057204655, "learning_rate": 1.5082344377666643e-05, "loss": 0.4913, "step": 2113 }, { "epoch": 3.7152899824253076, "grad_norm": 0.1602054631151704, "learning_rate": 1.5043963634315719e-05, "loss": 0.4891, "step": 2114 }, { "epoch": 3.717047451669596, "grad_norm": 0.18740955240354182, "learning_rate": 1.5005620474304553e-05, "loss": 0.4922, "step": 2115 }, { "epoch": 3.7188049209138843, "grad_norm": 0.16952177092798246, "learning_rate": 1.4967314955377248e-05, "loss": 0.4802, "step": 2116 }, { "epoch": 3.720562390158172, "grad_norm": 0.17214751637947528, "learning_rate": 1.4929047135221226e-05, "loss": 0.4976, "step": 2117 }, { "epoch": 3.7223198594024605, "grad_norm": 0.3204127407837524, "learning_rate": 1.489081707146712e-05, "loss": 0.4914, "step": 2118 }, { "epoch": 3.724077328646749, "grad_norm": 0.18217969910503656, "learning_rate": 1.4852624821688744e-05, "loss": 0.4878, "step": 2119 }, { "epoch": 3.7258347978910367, "grad_norm": 0.1505335125261073, "learning_rate": 1.4814470443402918e-05, "loss": 0.502, "step": 2120 }, { "epoch": 3.727592267135325, "grad_norm": 0.1655623315140575, "learning_rate": 1.4776353994069449e-05, "loss": 0.4967, "step": 2121 }, { "epoch": 3.7293497363796133, "grad_norm": 0.13651858242707451, "learning_rate": 1.4738275531091017e-05, "loss": 0.484, "step": 2122 }, { "epoch": 3.7311072056239016, "grad_norm": 0.14565128723192158, "learning_rate": 1.4700235111813105e-05, "loss": 0.4864, "step": 2123 }, { "epoch": 3.73286467486819, "grad_norm": 0.14735021961326433, "learning_rate": 1.4662232793523896e-05, "loss": 0.4963, "step": 2124 }, { "epoch": 3.7346221441124783, "grad_norm": 0.14447623364695672, "learning_rate": 1.4624268633454187e-05, "loss": 0.5021, "step": 2125 }, { "epoch": 3.736379613356766, "grad_norm": 0.1547119616816172, "learning_rate": 1.4586342688777326e-05, "loss": 0.5053, "step": 2126 }, { "epoch": 3.7381370826010545, "grad_norm": 0.14366360993150035, "learning_rate": 1.4548455016609095e-05, "loss": 0.4897, "step": 2127 }, { "epoch": 3.739894551845343, "grad_norm": 0.16939515341929778, "learning_rate": 1.451060567400763e-05, "loss": 0.4871, "step": 2128 }, { "epoch": 3.7416520210896307, "grad_norm": 0.13248350305669884, "learning_rate": 1.4472794717973392e-05, "loss": 0.5022, "step": 2129 }, { "epoch": 3.743409490333919, "grad_norm": 0.1834136873988127, "learning_rate": 1.4435022205448971e-05, "loss": 0.4836, "step": 2130 }, { "epoch": 3.7451669595782073, "grad_norm": 0.14478858679433831, "learning_rate": 1.4397288193319096e-05, "loss": 0.4931, "step": 2131 }, { "epoch": 3.7469244288224957, "grad_norm": 0.15340046885797332, "learning_rate": 1.4359592738410508e-05, "loss": 0.5005, "step": 2132 }, { "epoch": 3.748681898066784, "grad_norm": 0.16983513502314002, "learning_rate": 1.432193589749188e-05, "loss": 0.5076, "step": 2133 }, { "epoch": 3.7504393673110723, "grad_norm": 0.1352195072563255, "learning_rate": 1.4284317727273732e-05, "loss": 0.5064, "step": 2134 }, { "epoch": 3.75219683655536, "grad_norm": 0.1581253731418609, "learning_rate": 1.424673828440835e-05, "loss": 0.4914, "step": 2135 }, { "epoch": 3.7539543057996485, "grad_norm": 0.14786658104970954, "learning_rate": 1.4209197625489694e-05, "loss": 0.4922, "step": 2136 }, { "epoch": 3.755711775043937, "grad_norm": 0.1506744793711129, "learning_rate": 1.4171695807053319e-05, "loss": 0.4928, "step": 2137 }, { "epoch": 3.7574692442882247, "grad_norm": 0.14470500630138367, "learning_rate": 1.4134232885576275e-05, "loss": 0.4883, "step": 2138 }, { "epoch": 3.759226713532513, "grad_norm": 0.16502482164914625, "learning_rate": 1.4096808917477068e-05, "loss": 0.4947, "step": 2139 }, { "epoch": 3.7609841827768014, "grad_norm": 0.18301713394438013, "learning_rate": 1.4059423959115504e-05, "loss": 0.4932, "step": 2140 }, { "epoch": 3.7627416520210897, "grad_norm": 0.1694157859994962, "learning_rate": 1.402207806679266e-05, "loss": 0.4868, "step": 2141 }, { "epoch": 3.764499121265378, "grad_norm": 0.17759813153186924, "learning_rate": 1.3984771296750767e-05, "loss": 0.4848, "step": 2142 }, { "epoch": 3.7662565905096663, "grad_norm": 0.1583794117649617, "learning_rate": 1.3947503705173136e-05, "loss": 0.5003, "step": 2143 }, { "epoch": 3.768014059753954, "grad_norm": 0.181299968549736, "learning_rate": 1.3910275348184131e-05, "loss": 0.4884, "step": 2144 }, { "epoch": 3.7697715289982425, "grad_norm": 0.16365878986436697, "learning_rate": 1.387308628184894e-05, "loss": 0.4887, "step": 2145 }, { "epoch": 3.771528998242531, "grad_norm": 0.17963002150973162, "learning_rate": 1.383593656217364e-05, "loss": 0.4866, "step": 2146 }, { "epoch": 3.7732864674868187, "grad_norm": 0.1666123293658815, "learning_rate": 1.3798826245105036e-05, "loss": 0.4942, "step": 2147 }, { "epoch": 3.775043936731107, "grad_norm": 0.1481062062134611, "learning_rate": 1.3761755386530581e-05, "loss": 0.4942, "step": 2148 }, { "epoch": 3.7768014059753954, "grad_norm": 0.18313386311627192, "learning_rate": 1.372472404227835e-05, "loss": 0.4888, "step": 2149 }, { "epoch": 3.7785588752196837, "grad_norm": 0.1462695849998175, "learning_rate": 1.3687732268116855e-05, "loss": 0.4879, "step": 2150 }, { "epoch": 3.780316344463972, "grad_norm": 0.17765877334235777, "learning_rate": 1.3650780119755051e-05, "loss": 0.4882, "step": 2151 }, { "epoch": 3.7820738137082603, "grad_norm": 0.16916617080667046, "learning_rate": 1.36138676528422e-05, "loss": 0.4776, "step": 2152 }, { "epoch": 3.7838312829525482, "grad_norm": 0.14360303075064274, "learning_rate": 1.3576994922967792e-05, "loss": 0.5033, "step": 2153 }, { "epoch": 3.7855887521968365, "grad_norm": 0.1474205705996667, "learning_rate": 1.354016198566153e-05, "loss": 0.5023, "step": 2154 }, { "epoch": 3.787346221441125, "grad_norm": 0.13503031311952557, "learning_rate": 1.3503368896393126e-05, "loss": 0.4992, "step": 2155 }, { "epoch": 3.7891036906854128, "grad_norm": 0.12487665163995319, "learning_rate": 1.3466615710572328e-05, "loss": 0.4875, "step": 2156 }, { "epoch": 3.790861159929701, "grad_norm": 0.1555292789833239, "learning_rate": 1.3429902483548745e-05, "loss": 0.4912, "step": 2157 }, { "epoch": 3.7926186291739894, "grad_norm": 0.15676309957051937, "learning_rate": 1.3393229270611828e-05, "loss": 0.4907, "step": 2158 }, { "epoch": 3.7943760984182777, "grad_norm": 0.13557217051442783, "learning_rate": 1.3356596126990802e-05, "loss": 0.4952, "step": 2159 }, { "epoch": 3.796133567662566, "grad_norm": 0.1410018797144549, "learning_rate": 1.3320003107854507e-05, "loss": 0.5002, "step": 2160 }, { "epoch": 3.7978910369068544, "grad_norm": 0.15540920842318917, "learning_rate": 1.3283450268311361e-05, "loss": 0.4918, "step": 2161 }, { "epoch": 3.7996485061511422, "grad_norm": 0.1184160169193259, "learning_rate": 1.3246937663409294e-05, "loss": 0.4936, "step": 2162 }, { "epoch": 3.8014059753954306, "grad_norm": 0.14208758828415569, "learning_rate": 1.32104653481356e-05, "loss": 0.4875, "step": 2163 }, { "epoch": 3.803163444639719, "grad_norm": 0.13163507076079964, "learning_rate": 1.3174033377416966e-05, "loss": 0.5035, "step": 2164 }, { "epoch": 3.8049209138840068, "grad_norm": 0.1462321579425802, "learning_rate": 1.3137641806119264e-05, "loss": 0.4882, "step": 2165 }, { "epoch": 3.806678383128295, "grad_norm": 0.1424989328563288, "learning_rate": 1.3101290689047539e-05, "loss": 0.504, "step": 2166 }, { "epoch": 3.8084358523725834, "grad_norm": 0.16321181528679185, "learning_rate": 1.3064980080945922e-05, "loss": 0.4917, "step": 2167 }, { "epoch": 3.8101933216168717, "grad_norm": 0.14712861537028704, "learning_rate": 1.3028710036497527e-05, "loss": 0.5043, "step": 2168 }, { "epoch": 3.81195079086116, "grad_norm": 0.13549754765985894, "learning_rate": 1.2992480610324388e-05, "loss": 0.4838, "step": 2169 }, { "epoch": 3.8137082601054484, "grad_norm": 0.1551358073150057, "learning_rate": 1.2956291856987369e-05, "loss": 0.4885, "step": 2170 }, { "epoch": 3.8154657293497363, "grad_norm": 0.13684182048668725, "learning_rate": 1.2920143830986072e-05, "loss": 0.4868, "step": 2171 }, { "epoch": 3.8172231985940246, "grad_norm": 0.15542314893307357, "learning_rate": 1.2884036586758777e-05, "loss": 0.5089, "step": 2172 }, { "epoch": 3.818980667838313, "grad_norm": 0.14063024060657972, "learning_rate": 1.284797017868233e-05, "loss": 0.4901, "step": 2173 }, { "epoch": 3.820738137082601, "grad_norm": 0.9064389945772238, "learning_rate": 1.2811944661072104e-05, "loss": 0.4935, "step": 2174 }, { "epoch": 3.822495606326889, "grad_norm": 0.13452757024011205, "learning_rate": 1.277596008818188e-05, "loss": 0.4953, "step": 2175 }, { "epoch": 3.8242530755711774, "grad_norm": 0.1387867875685744, "learning_rate": 1.2740016514203762e-05, "loss": 0.4832, "step": 2176 }, { "epoch": 3.8260105448154658, "grad_norm": 0.1404426367010001, "learning_rate": 1.2704113993268129e-05, "loss": 0.5037, "step": 2177 }, { "epoch": 3.827768014059754, "grad_norm": 0.13133178004942003, "learning_rate": 1.2668252579443525e-05, "loss": 0.4861, "step": 2178 }, { "epoch": 3.8295254833040424, "grad_norm": 0.1564498827452639, "learning_rate": 1.2632432326736597e-05, "loss": 0.488, "step": 2179 }, { "epoch": 3.8312829525483303, "grad_norm": 0.1367406278460107, "learning_rate": 1.2596653289091991e-05, "loss": 0.4914, "step": 2180 }, { "epoch": 3.8330404217926186, "grad_norm": 0.13348726785118115, "learning_rate": 1.2560915520392296e-05, "loss": 0.4887, "step": 2181 }, { "epoch": 3.834797891036907, "grad_norm": 0.1458972333126416, "learning_rate": 1.2525219074457949e-05, "loss": 0.4969, "step": 2182 }, { "epoch": 3.836555360281195, "grad_norm": 0.12070718955258146, "learning_rate": 1.2489564005047128e-05, "loss": 0.498, "step": 2183 }, { "epoch": 3.838312829525483, "grad_norm": 0.17212629533984994, "learning_rate": 1.2453950365855757e-05, "loss": 0.4781, "step": 2184 }, { "epoch": 3.8400702987697715, "grad_norm": 0.1264595029021295, "learning_rate": 1.2418378210517323e-05, "loss": 0.5042, "step": 2185 }, { "epoch": 3.84182776801406, "grad_norm": 0.14852507241608418, "learning_rate": 1.2382847592602847e-05, "loss": 0.4856, "step": 2186 }, { "epoch": 3.843585237258348, "grad_norm": 0.16716993709040218, "learning_rate": 1.2347358565620789e-05, "loss": 0.4974, "step": 2187 }, { "epoch": 3.8453427065026364, "grad_norm": 0.140341537583772, "learning_rate": 1.2311911183016991e-05, "loss": 0.503, "step": 2188 }, { "epoch": 3.8471001757469243, "grad_norm": 0.14680783711875137, "learning_rate": 1.2276505498174571e-05, "loss": 0.4899, "step": 2189 }, { "epoch": 3.8488576449912126, "grad_norm": 0.12759175919351237, "learning_rate": 1.224114156441385e-05, "loss": 0.4966, "step": 2190 }, { "epoch": 3.850615114235501, "grad_norm": 0.13109490679375832, "learning_rate": 1.2205819434992265e-05, "loss": 0.4855, "step": 2191 }, { "epoch": 3.852372583479789, "grad_norm": 0.14634813385487555, "learning_rate": 1.2170539163104315e-05, "loss": 0.4841, "step": 2192 }, { "epoch": 3.854130052724077, "grad_norm": 0.13202374873561412, "learning_rate": 1.2135300801881433e-05, "loss": 0.4818, "step": 2193 }, { "epoch": 3.8558875219683655, "grad_norm": 0.14710569992481123, "learning_rate": 1.2100104404391981e-05, "loss": 0.4728, "step": 2194 }, { "epoch": 3.857644991212654, "grad_norm": 0.16602940811908667, "learning_rate": 1.2064950023641088e-05, "loss": 0.4973, "step": 2195 }, { "epoch": 3.859402460456942, "grad_norm": 0.14227589060758924, "learning_rate": 1.2029837712570611e-05, "loss": 0.5027, "step": 2196 }, { "epoch": 3.8611599297012305, "grad_norm": 0.1404380814549682, "learning_rate": 1.199476752405906e-05, "loss": 0.4895, "step": 2197 }, { "epoch": 3.8629173989455183, "grad_norm": 0.12755838949732584, "learning_rate": 1.195973951092149e-05, "loss": 0.4865, "step": 2198 }, { "epoch": 3.8646748681898067, "grad_norm": 0.137289342264857, "learning_rate": 1.1924753725909483e-05, "loss": 0.4917, "step": 2199 }, { "epoch": 3.866432337434095, "grad_norm": 0.12093939331701889, "learning_rate": 1.1889810221710998e-05, "loss": 0.4936, "step": 2200 }, { "epoch": 3.868189806678383, "grad_norm": 0.12297695926012836, "learning_rate": 1.1854909050950294e-05, "loss": 0.484, "step": 2201 }, { "epoch": 3.869947275922671, "grad_norm": 0.1424060543011497, "learning_rate": 1.1820050266187919e-05, "loss": 0.4939, "step": 2202 }, { "epoch": 3.8717047451669595, "grad_norm": 0.1972863888665481, "learning_rate": 1.1785233919920556e-05, "loss": 0.4989, "step": 2203 }, { "epoch": 3.873462214411248, "grad_norm": 0.1435469747236554, "learning_rate": 1.1750460064581022e-05, "loss": 0.4956, "step": 2204 }, { "epoch": 3.875219683655536, "grad_norm": 0.1370250635512526, "learning_rate": 1.1715728752538103e-05, "loss": 0.4785, "step": 2205 }, { "epoch": 3.8769771528998245, "grad_norm": 0.12811316759046415, "learning_rate": 1.1681040036096522e-05, "loss": 0.4979, "step": 2206 }, { "epoch": 3.8787346221441124, "grad_norm": 0.14027426336404636, "learning_rate": 1.1646393967496867e-05, "loss": 0.4906, "step": 2207 }, { "epoch": 3.8804920913884007, "grad_norm": 0.14260860307606826, "learning_rate": 1.1611790598915471e-05, "loss": 0.4839, "step": 2208 }, { "epoch": 3.882249560632689, "grad_norm": 0.13124974965815414, "learning_rate": 1.1577229982464413e-05, "loss": 0.4873, "step": 2209 }, { "epoch": 3.884007029876977, "grad_norm": 0.1234252934985588, "learning_rate": 1.1542712170191338e-05, "loss": 0.4897, "step": 2210 }, { "epoch": 3.885764499121265, "grad_norm": 0.12859508823119917, "learning_rate": 1.1508237214079467e-05, "loss": 0.4852, "step": 2211 }, { "epoch": 3.8875219683655535, "grad_norm": 0.12495921786296614, "learning_rate": 1.1473805166047432e-05, "loss": 0.4899, "step": 2212 }, { "epoch": 3.889279437609842, "grad_norm": 0.13836047924805828, "learning_rate": 1.1439416077949272e-05, "loss": 0.4882, "step": 2213 }, { "epoch": 3.89103690685413, "grad_norm": 0.12168906735826247, "learning_rate": 1.1405070001574363e-05, "loss": 0.4999, "step": 2214 }, { "epoch": 3.8927943760984185, "grad_norm": 0.12229784179962627, "learning_rate": 1.1370766988647257e-05, "loss": 0.4989, "step": 2215 }, { "epoch": 3.8945518453427064, "grad_norm": 0.13066039461796597, "learning_rate": 1.133650709082768e-05, "loss": 0.4902, "step": 2216 }, { "epoch": 3.8963093145869947, "grad_norm": 0.13204369130951804, "learning_rate": 1.1302290359710408e-05, "loss": 0.487, "step": 2217 }, { "epoch": 3.898066783831283, "grad_norm": 0.12667702804092054, "learning_rate": 1.126811684682522e-05, "loss": 0.4974, "step": 2218 }, { "epoch": 3.899824253075571, "grad_norm": 0.14635013194470714, "learning_rate": 1.123398660363682e-05, "loss": 0.4945, "step": 2219 }, { "epoch": 3.9015817223198592, "grad_norm": 0.1352968426675649, "learning_rate": 1.1199899681544735e-05, "loss": 0.487, "step": 2220 }, { "epoch": 3.9033391915641475, "grad_norm": 0.1298372749251057, "learning_rate": 1.1165856131883247e-05, "loss": 0.5007, "step": 2221 }, { "epoch": 3.905096660808436, "grad_norm": 0.16359479947702874, "learning_rate": 1.1131856005921335e-05, "loss": 0.4908, "step": 2222 }, { "epoch": 3.906854130052724, "grad_norm": 0.128176563868422, "learning_rate": 1.1097899354862568e-05, "loss": 0.49, "step": 2223 }, { "epoch": 3.9086115992970125, "grad_norm": 0.1299065344684281, "learning_rate": 1.1063986229845044e-05, "loss": 0.4846, "step": 2224 }, { "epoch": 3.9103690685413004, "grad_norm": 0.14823353516805682, "learning_rate": 1.103011668194133e-05, "loss": 0.4948, "step": 2225 }, { "epoch": 3.9121265377855887, "grad_norm": 0.16316208702590312, "learning_rate": 1.099629076215834e-05, "loss": 0.4962, "step": 2226 }, { "epoch": 3.913884007029877, "grad_norm": 0.14238123350743656, "learning_rate": 1.0962508521437307e-05, "loss": 0.481, "step": 2227 }, { "epoch": 3.9156414762741654, "grad_norm": 0.136683416753211, "learning_rate": 1.0928770010653658e-05, "loss": 0.4938, "step": 2228 }, { "epoch": 3.9173989455184532, "grad_norm": 0.1644138442237322, "learning_rate": 1.0895075280617004e-05, "loss": 0.488, "step": 2229 }, { "epoch": 3.9191564147627416, "grad_norm": 0.12904784001598904, "learning_rate": 1.0861424382070994e-05, "loss": 0.4862, "step": 2230 }, { "epoch": 3.92091388400703, "grad_norm": 0.18501529430798014, "learning_rate": 1.0827817365693263e-05, "loss": 0.4892, "step": 2231 }, { "epoch": 3.922671353251318, "grad_norm": 0.14852383391876592, "learning_rate": 1.0794254282095379e-05, "loss": 0.5088, "step": 2232 }, { "epoch": 3.9244288224956065, "grad_norm": 0.148307743551583, "learning_rate": 1.0760735181822736e-05, "loss": 0.4852, "step": 2233 }, { "epoch": 3.9261862917398944, "grad_norm": 0.15508688697417394, "learning_rate": 1.072726011535449e-05, "loss": 0.491, "step": 2234 }, { "epoch": 3.9279437609841827, "grad_norm": 0.12594298696295067, "learning_rate": 1.0693829133103493e-05, "loss": 0.5007, "step": 2235 }, { "epoch": 3.929701230228471, "grad_norm": 0.1621040444882277, "learning_rate": 1.0660442285416197e-05, "loss": 0.5032, "step": 2236 }, { "epoch": 3.9314586994727594, "grad_norm": 0.13777735152859266, "learning_rate": 1.0627099622572588e-05, "loss": 0.5031, "step": 2237 }, { "epoch": 3.9332161687170473, "grad_norm": 0.125638017050677, "learning_rate": 1.0593801194786107e-05, "loss": 0.4911, "step": 2238 }, { "epoch": 3.9349736379613356, "grad_norm": 0.1553404823144308, "learning_rate": 1.0560547052203605e-05, "loss": 0.4884, "step": 2239 }, { "epoch": 3.936731107205624, "grad_norm": 0.1359992413897232, "learning_rate": 1.0527337244905205e-05, "loss": 0.4952, "step": 2240 }, { "epoch": 3.9384885764499122, "grad_norm": 0.12112392177114235, "learning_rate": 1.0494171822904286e-05, "loss": 0.4927, "step": 2241 }, { "epoch": 3.9402460456942006, "grad_norm": 0.13996526566040313, "learning_rate": 1.0461050836147365e-05, "loss": 0.4835, "step": 2242 }, { "epoch": 3.9420035149384884, "grad_norm": 0.13550962228231614, "learning_rate": 1.0427974334514053e-05, "loss": 0.5141, "step": 2243 }, { "epoch": 3.9437609841827768, "grad_norm": 0.1268943733072131, "learning_rate": 1.0394942367816965e-05, "loss": 0.493, "step": 2244 }, { "epoch": 3.945518453427065, "grad_norm": 0.11131802333218713, "learning_rate": 1.0361954985801646e-05, "loss": 0.4865, "step": 2245 }, { "epoch": 3.9472759226713534, "grad_norm": 0.12819144474456107, "learning_rate": 1.0329012238146495e-05, "loss": 0.4978, "step": 2246 }, { "epoch": 3.9490333919156413, "grad_norm": 0.11777912127831897, "learning_rate": 1.0296114174462693e-05, "loss": 0.4849, "step": 2247 }, { "epoch": 3.9507908611599296, "grad_norm": 0.12184905198971545, "learning_rate": 1.026326084429412e-05, "loss": 0.4968, "step": 2248 }, { "epoch": 3.952548330404218, "grad_norm": 0.12756267003614177, "learning_rate": 1.0230452297117321e-05, "loss": 0.4889, "step": 2249 }, { "epoch": 3.9543057996485063, "grad_norm": 0.11306496247488806, "learning_rate": 1.0197688582341363e-05, "loss": 0.4997, "step": 2250 }, { "epoch": 3.9560632688927946, "grad_norm": 0.12357276233391941, "learning_rate": 1.0164969749307812e-05, "loss": 0.4862, "step": 2251 }, { "epoch": 3.9578207381370825, "grad_norm": 0.13526931606499068, "learning_rate": 1.0132295847290634e-05, "loss": 0.4854, "step": 2252 }, { "epoch": 3.959578207381371, "grad_norm": 0.12508345824346487, "learning_rate": 1.0099666925496123e-05, "loss": 0.488, "step": 2253 }, { "epoch": 3.961335676625659, "grad_norm": 0.12714598652690015, "learning_rate": 1.0067083033062875e-05, "loss": 0.4973, "step": 2254 }, { "epoch": 3.9630931458699474, "grad_norm": 0.13191210985863158, "learning_rate": 1.0034544219061635e-05, "loss": 0.496, "step": 2255 }, { "epoch": 3.9648506151142353, "grad_norm": 0.10856721053828865, "learning_rate": 1.0002050532495252e-05, "loss": 0.4807, "step": 2256 }, { "epoch": 3.9666080843585236, "grad_norm": 0.14398824871249627, "learning_rate": 9.969602022298641e-06, "loss": 0.4945, "step": 2257 }, { "epoch": 3.968365553602812, "grad_norm": 0.11897140817920193, "learning_rate": 9.937198737338658e-06, "loss": 0.4901, "step": 2258 }, { "epoch": 3.9701230228471003, "grad_norm": 0.13195977419506522, "learning_rate": 9.904840726414088e-06, "loss": 0.484, "step": 2259 }, { "epoch": 3.9718804920913886, "grad_norm": 0.1315846514014655, "learning_rate": 9.872528038255505e-06, "loss": 0.5043, "step": 2260 }, { "epoch": 3.9736379613356765, "grad_norm": 0.12888021198703467, "learning_rate": 9.840260721525223e-06, "loss": 0.4951, "step": 2261 }, { "epoch": 3.975395430579965, "grad_norm": 0.1298508543723959, "learning_rate": 9.80803882481725e-06, "loss": 0.4921, "step": 2262 }, { "epoch": 3.977152899824253, "grad_norm": 0.13797997197371986, "learning_rate": 9.775862396657159e-06, "loss": 0.4898, "step": 2263 }, { "epoch": 3.9789103690685415, "grad_norm": 0.15177119361171149, "learning_rate": 9.743731485502095e-06, "loss": 0.4824, "step": 2264 }, { "epoch": 3.9806678383128293, "grad_norm": 0.133884394893542, "learning_rate": 9.711646139740619e-06, "loss": 0.499, "step": 2265 }, { "epoch": 3.9824253075571177, "grad_norm": 0.15515154907552323, "learning_rate": 9.679606407692676e-06, "loss": 0.492, "step": 2266 }, { "epoch": 3.984182776801406, "grad_norm": 0.13551693713796487, "learning_rate": 9.647612337609549e-06, "loss": 0.4921, "step": 2267 }, { "epoch": 3.9859402460456943, "grad_norm": 0.15331951512695122, "learning_rate": 9.615663977673679e-06, "loss": 0.4982, "step": 2268 }, { "epoch": 3.9876977152899826, "grad_norm": 0.16462894511141346, "learning_rate": 9.583761375998763e-06, "loss": 0.4878, "step": 2269 }, { "epoch": 3.9894551845342705, "grad_norm": 0.12383937598193932, "learning_rate": 9.551904580629534e-06, "loss": 0.495, "step": 2270 }, { "epoch": 3.991212653778559, "grad_norm": 0.14140451160672698, "learning_rate": 9.520093639541739e-06, "loss": 0.4864, "step": 2271 }, { "epoch": 3.992970123022847, "grad_norm": 0.14364250472035872, "learning_rate": 9.488328600642086e-06, "loss": 0.4841, "step": 2272 }, { "epoch": 3.9947275922671355, "grad_norm": 0.13295835362093228, "learning_rate": 9.45660951176814e-06, "loss": 0.4902, "step": 2273 }, { "epoch": 3.9964850615114234, "grad_norm": 0.14636125316870024, "learning_rate": 9.424936420688295e-06, "loss": 0.4948, "step": 2274 }, { "epoch": 3.9982425307557117, "grad_norm": 0.12874124643677284, "learning_rate": 9.393309375101642e-06, "loss": 0.5021, "step": 2275 }, { "epoch": 4.0, "grad_norm": 0.19147865489659915, "learning_rate": 9.361728422637943e-06, "loss": 0.4703, "step": 2276 }, { "epoch": 4.001757469244288, "grad_norm": 0.14943733085868469, "learning_rate": 9.33019361085754e-06, "loss": 0.4758, "step": 2277 }, { "epoch": 4.003514938488577, "grad_norm": 0.17441475773974444, "learning_rate": 9.298704987251295e-06, "loss": 0.4782, "step": 2278 }, { "epoch": 4.005272407732865, "grad_norm": 0.14876529777705355, "learning_rate": 9.267262599240499e-06, "loss": 0.47, "step": 2279 }, { "epoch": 4.007029876977153, "grad_norm": 0.176804199570114, "learning_rate": 9.235866494176822e-06, "loss": 0.4788, "step": 2280 }, { "epoch": 4.008787346221441, "grad_norm": 0.19543088074213355, "learning_rate": 9.20451671934223e-06, "loss": 0.4839, "step": 2281 }, { "epoch": 4.010544815465729, "grad_norm": 0.1366112503720805, "learning_rate": 9.17321332194892e-06, "loss": 0.4718, "step": 2282 }, { "epoch": 4.012302284710017, "grad_norm": 0.20596846038365565, "learning_rate": 9.141956349139227e-06, "loss": 0.4732, "step": 2283 }, { "epoch": 4.014059753954306, "grad_norm": 0.17955030431698482, "learning_rate": 9.110745847985614e-06, "loss": 0.477, "step": 2284 }, { "epoch": 4.015817223198594, "grad_norm": 0.1534533742739975, "learning_rate": 9.07958186549052e-06, "loss": 0.4786, "step": 2285 }, { "epoch": 4.017574692442882, "grad_norm": 0.19765663027871982, "learning_rate": 9.048464448586341e-06, "loss": 0.4703, "step": 2286 }, { "epoch": 4.019332161687171, "grad_norm": 0.14736044091225345, "learning_rate": 9.017393644135341e-06, "loss": 0.474, "step": 2287 }, { "epoch": 4.021089630931459, "grad_norm": 0.16494565202338923, "learning_rate": 8.986369498929588e-06, "loss": 0.4693, "step": 2288 }, { "epoch": 4.022847100175747, "grad_norm": 0.18532120230109472, "learning_rate": 8.955392059690893e-06, "loss": 0.4681, "step": 2289 }, { "epoch": 4.024604569420035, "grad_norm": 0.15127775711817842, "learning_rate": 8.924461373070707e-06, "loss": 0.4697, "step": 2290 }, { "epoch": 4.026362038664323, "grad_norm": 0.4141822079953444, "learning_rate": 8.893577485650095e-06, "loss": 0.4738, "step": 2291 }, { "epoch": 4.028119507908611, "grad_norm": 0.16682323056837028, "learning_rate": 8.86274044393963e-06, "loss": 0.475, "step": 2292 }, { "epoch": 4.0298769771529, "grad_norm": 0.15414312892656062, "learning_rate": 8.831950294379332e-06, "loss": 0.4667, "step": 2293 }, { "epoch": 4.031634446397188, "grad_norm": 0.16125162550379948, "learning_rate": 8.801207083338625e-06, "loss": 0.4718, "step": 2294 }, { "epoch": 4.033391915641476, "grad_norm": 0.14860947060234153, "learning_rate": 8.770510857116221e-06, "loss": 0.4708, "step": 2295 }, { "epoch": 4.035149384885765, "grad_norm": 0.12444946928358876, "learning_rate": 8.739861661940083e-06, "loss": 0.48, "step": 2296 }, { "epoch": 4.036906854130053, "grad_norm": 0.13055339133034852, "learning_rate": 8.709259543967348e-06, "loss": 0.4711, "step": 2297 }, { "epoch": 4.038664323374341, "grad_norm": 0.13041763335691503, "learning_rate": 8.678704549284247e-06, "loss": 0.4693, "step": 2298 }, { "epoch": 4.040421792618629, "grad_norm": 0.12626951188126526, "learning_rate": 8.648196723906053e-06, "loss": 0.4734, "step": 2299 }, { "epoch": 4.042179261862917, "grad_norm": 0.1720687801678759, "learning_rate": 8.617736113777e-06, "loss": 0.4705, "step": 2300 }, { "epoch": 4.043936731107205, "grad_norm": 0.15864649760324617, "learning_rate": 8.587322764770212e-06, "loss": 0.4804, "step": 2301 }, { "epoch": 4.045694200351494, "grad_norm": 0.12865176030885972, "learning_rate": 8.556956722687646e-06, "loss": 0.4725, "step": 2302 }, { "epoch": 4.047451669595782, "grad_norm": 0.13516399683142186, "learning_rate": 8.526638033260007e-06, "loss": 0.4757, "step": 2303 }, { "epoch": 4.04920913884007, "grad_norm": 0.11759011116462831, "learning_rate": 8.496366742146701e-06, "loss": 0.4794, "step": 2304 }, { "epoch": 4.050966608084359, "grad_norm": 0.11066508864911202, "learning_rate": 8.466142894935752e-06, "loss": 0.472, "step": 2305 }, { "epoch": 4.052724077328647, "grad_norm": 0.1252576041039245, "learning_rate": 8.435966537143715e-06, "loss": 0.4723, "step": 2306 }, { "epoch": 4.054481546572935, "grad_norm": 0.13060692308244917, "learning_rate": 8.405837714215645e-06, "loss": 0.4823, "step": 2307 }, { "epoch": 4.056239015817223, "grad_norm": 0.12092992944432372, "learning_rate": 8.375756471524993e-06, "loss": 0.4678, "step": 2308 }, { "epoch": 4.057996485061511, "grad_norm": 0.1173146453180396, "learning_rate": 8.345722854373588e-06, "loss": 0.4684, "step": 2309 }, { "epoch": 4.059753954305799, "grad_norm": 0.12993269240158514, "learning_rate": 8.315736907991514e-06, "loss": 0.4785, "step": 2310 }, { "epoch": 4.061511423550088, "grad_norm": 0.1355573559345709, "learning_rate": 8.285798677537044e-06, "loss": 0.4733, "step": 2311 }, { "epoch": 4.063268892794376, "grad_norm": 0.10904718558244252, "learning_rate": 8.255908208096622e-06, "loss": 0.4745, "step": 2312 }, { "epoch": 4.065026362038664, "grad_norm": 0.12669428838925348, "learning_rate": 8.226065544684747e-06, "loss": 0.4716, "step": 2313 }, { "epoch": 4.066783831282953, "grad_norm": 0.14332578941352137, "learning_rate": 8.196270732243948e-06, "loss": 0.4768, "step": 2314 }, { "epoch": 4.068541300527241, "grad_norm": 0.10849621773231566, "learning_rate": 8.166523815644663e-06, "loss": 0.4749, "step": 2315 }, { "epoch": 4.070298769771529, "grad_norm": 0.14020040894442473, "learning_rate": 8.136824839685213e-06, "loss": 0.4797, "step": 2316 }, { "epoch": 4.072056239015817, "grad_norm": 0.12274179810487063, "learning_rate": 8.10717384909172e-06, "loss": 0.4806, "step": 2317 }, { "epoch": 4.073813708260105, "grad_norm": 0.11793002739865899, "learning_rate": 8.077570888518029e-06, "loss": 0.4695, "step": 2318 }, { "epoch": 4.0755711775043935, "grad_norm": 0.12488385672943975, "learning_rate": 8.048016002545682e-06, "loss": 0.4728, "step": 2319 }, { "epoch": 4.077328646748682, "grad_norm": 0.12342813286581357, "learning_rate": 8.018509235683796e-06, "loss": 0.472, "step": 2320 }, { "epoch": 4.07908611599297, "grad_norm": 0.12457338275689635, "learning_rate": 7.98905063236903e-06, "loss": 0.4741, "step": 2321 }, { "epoch": 4.080843585237258, "grad_norm": 0.1283763659957353, "learning_rate": 7.95964023696551e-06, "loss": 0.476, "step": 2322 }, { "epoch": 4.082601054481547, "grad_norm": 0.14134827884915577, "learning_rate": 7.930278093764739e-06, "loss": 0.478, "step": 2323 }, { "epoch": 4.084358523725835, "grad_norm": 0.12298089685881484, "learning_rate": 7.900964246985601e-06, "loss": 0.475, "step": 2324 }, { "epoch": 4.086115992970123, "grad_norm": 0.12645382229251265, "learning_rate": 7.871698740774208e-06, "loss": 0.4927, "step": 2325 }, { "epoch": 4.087873462214411, "grad_norm": 0.13090494076975628, "learning_rate": 7.842481619203886e-06, "loss": 0.4607, "step": 2326 }, { "epoch": 4.089630931458699, "grad_norm": 0.12049699739268067, "learning_rate": 7.813312926275087e-06, "loss": 0.4707, "step": 2327 }, { "epoch": 4.0913884007029875, "grad_norm": 0.11849860519105619, "learning_rate": 7.784192705915333e-06, "loss": 0.4715, "step": 2328 }, { "epoch": 4.093145869947276, "grad_norm": 0.11963821073810288, "learning_rate": 7.755121001979162e-06, "loss": 0.468, "step": 2329 }, { "epoch": 4.094903339191564, "grad_norm": 0.1418982979476303, "learning_rate": 7.726097858248027e-06, "loss": 0.4826, "step": 2330 }, { "epoch": 4.0966608084358525, "grad_norm": 0.12725522882932883, "learning_rate": 7.697123318430262e-06, "loss": 0.4683, "step": 2331 }, { "epoch": 4.098418277680141, "grad_norm": 0.11404951215408607, "learning_rate": 7.668197426160998e-06, "loss": 0.4631, "step": 2332 }, { "epoch": 4.100175746924429, "grad_norm": 0.1295799566928521, "learning_rate": 7.639320225002106e-06, "loss": 0.4765, "step": 2333 }, { "epoch": 4.101933216168717, "grad_norm": 0.14190699484078093, "learning_rate": 7.610491758442129e-06, "loss": 0.483, "step": 2334 }, { "epoch": 4.103690685413005, "grad_norm": 0.12583799855090425, "learning_rate": 7.5817120698962145e-06, "loss": 0.488, "step": 2335 }, { "epoch": 4.105448154657293, "grad_norm": 0.1401407559617577, "learning_rate": 7.552981202706053e-06, "loss": 0.4774, "step": 2336 }, { "epoch": 4.1072056239015815, "grad_norm": 0.14589605377211337, "learning_rate": 7.524299200139818e-06, "loss": 0.4768, "step": 2337 }, { "epoch": 4.10896309314587, "grad_norm": 0.1353006834293258, "learning_rate": 7.49566610539207e-06, "loss": 0.4798, "step": 2338 }, { "epoch": 4.110720562390158, "grad_norm": 0.12322184223340113, "learning_rate": 7.467081961583753e-06, "loss": 0.4913, "step": 2339 }, { "epoch": 4.1124780316344465, "grad_norm": 0.13097923356987184, "learning_rate": 7.438546811762064e-06, "loss": 0.4688, "step": 2340 }, { "epoch": 4.114235500878735, "grad_norm": 0.11979393851903317, "learning_rate": 7.410060698900423e-06, "loss": 0.4795, "step": 2341 }, { "epoch": 4.115992970123023, "grad_norm": 0.12938753337740416, "learning_rate": 7.381623665898398e-06, "loss": 0.4855, "step": 2342 }, { "epoch": 4.117750439367311, "grad_norm": 0.12777348708659492, "learning_rate": 7.353235755581654e-06, "loss": 0.4716, "step": 2343 }, { "epoch": 4.119507908611599, "grad_norm": 0.10481627819449874, "learning_rate": 7.3248970107018565e-06, "loss": 0.471, "step": 2344 }, { "epoch": 4.121265377855887, "grad_norm": 0.11506799889268927, "learning_rate": 7.296607473936657e-06, "loss": 0.4817, "step": 2345 }, { "epoch": 4.1230228471001755, "grad_norm": 0.11763862814186771, "learning_rate": 7.268367187889582e-06, "loss": 0.4643, "step": 2346 }, { "epoch": 4.124780316344464, "grad_norm": 0.11661825334886883, "learning_rate": 7.240176195089992e-06, "loss": 0.4753, "step": 2347 }, { "epoch": 4.126537785588752, "grad_norm": 0.10621381951691966, "learning_rate": 7.212034537993e-06, "loss": 0.4651, "step": 2348 }, { "epoch": 4.1282952548330405, "grad_norm": 0.13531019362993185, "learning_rate": 7.183942258979457e-06, "loss": 0.468, "step": 2349 }, { "epoch": 4.130052724077329, "grad_norm": 0.12267381072924534, "learning_rate": 7.155899400355815e-06, "loss": 0.4792, "step": 2350 }, { "epoch": 4.131810193321617, "grad_norm": 0.10834768513246155, "learning_rate": 7.1279060043541124e-06, "loss": 0.4669, "step": 2351 }, { "epoch": 4.1335676625659055, "grad_norm": 0.1170597452144285, "learning_rate": 7.0999621131319e-06, "loss": 0.4641, "step": 2352 }, { "epoch": 4.135325131810193, "grad_norm": 0.1180784847497652, "learning_rate": 7.0720677687721616e-06, "loss": 0.4748, "step": 2353 }, { "epoch": 4.137082601054481, "grad_norm": 0.10238537154068629, "learning_rate": 7.044223013283304e-06, "loss": 0.472, "step": 2354 }, { "epoch": 4.1388400702987695, "grad_norm": 0.11274551016109438, "learning_rate": 7.016427888598998e-06, "loss": 0.469, "step": 2355 }, { "epoch": 4.140597539543058, "grad_norm": 0.12673465431577224, "learning_rate": 6.988682436578207e-06, "loss": 0.4809, "step": 2356 }, { "epoch": 4.142355008787346, "grad_norm": 0.11457179779143728, "learning_rate": 6.960986699005081e-06, "loss": 0.4782, "step": 2357 }, { "epoch": 4.1441124780316345, "grad_norm": 0.12004105239664205, "learning_rate": 6.933340717588892e-06, "loss": 0.4816, "step": 2358 }, { "epoch": 4.145869947275923, "grad_norm": 0.12116478331073224, "learning_rate": 6.905744533964008e-06, "loss": 0.4689, "step": 2359 }, { "epoch": 4.147627416520211, "grad_norm": 0.11226646585908581, "learning_rate": 6.878198189689773e-06, "loss": 0.4751, "step": 2360 }, { "epoch": 4.1493848857644995, "grad_norm": 0.1160094110906552, "learning_rate": 6.850701726250481e-06, "loss": 0.4834, "step": 2361 }, { "epoch": 4.151142355008787, "grad_norm": 0.11564202347109308, "learning_rate": 6.823255185055311e-06, "loss": 0.4677, "step": 2362 }, { "epoch": 4.152899824253075, "grad_norm": 0.12078044456664497, "learning_rate": 6.795858607438246e-06, "loss": 0.4733, "step": 2363 }, { "epoch": 4.154657293497364, "grad_norm": 0.11327151367897427, "learning_rate": 6.7685120346580615e-06, "loss": 0.4728, "step": 2364 }, { "epoch": 4.156414762741652, "grad_norm": 0.11273893206641596, "learning_rate": 6.7412155078981865e-06, "loss": 0.4856, "step": 2365 }, { "epoch": 4.15817223198594, "grad_norm": 0.10611691120965397, "learning_rate": 6.7139690682667125e-06, "loss": 0.4715, "step": 2366 }, { "epoch": 4.1599297012302285, "grad_norm": 0.11261654142728054, "learning_rate": 6.68677275679626e-06, "loss": 0.4609, "step": 2367 }, { "epoch": 4.161687170474517, "grad_norm": 0.1020074581061861, "learning_rate": 6.659626614443983e-06, "loss": 0.4901, "step": 2368 }, { "epoch": 4.163444639718805, "grad_norm": 0.10029155521925286, "learning_rate": 6.632530682091509e-06, "loss": 0.4751, "step": 2369 }, { "epoch": 4.1652021089630935, "grad_norm": 0.11194800631063974, "learning_rate": 6.6054850005448e-06, "loss": 0.485, "step": 2370 }, { "epoch": 4.166959578207381, "grad_norm": 0.11042723199068476, "learning_rate": 6.578489610534174e-06, "loss": 0.4689, "step": 2371 }, { "epoch": 4.168717047451669, "grad_norm": 0.11631650435543538, "learning_rate": 6.551544552714193e-06, "loss": 0.472, "step": 2372 }, { "epoch": 4.170474516695958, "grad_norm": 0.10823815464880165, "learning_rate": 6.524649867663626e-06, "loss": 0.4888, "step": 2373 }, { "epoch": 4.172231985940246, "grad_norm": 0.11846969870589115, "learning_rate": 6.497805595885393e-06, "loss": 0.475, "step": 2374 }, { "epoch": 4.173989455184534, "grad_norm": 0.10768704535765526, "learning_rate": 6.471011777806477e-06, "loss": 0.4743, "step": 2375 }, { "epoch": 4.175746924428823, "grad_norm": 0.12186267986294798, "learning_rate": 6.444268453777885e-06, "loss": 0.4803, "step": 2376 }, { "epoch": 4.177504393673111, "grad_norm": 0.12392699650803864, "learning_rate": 6.417575664074585e-06, "loss": 0.4686, "step": 2377 }, { "epoch": 4.179261862917399, "grad_norm": 0.10280198327299278, "learning_rate": 6.390933448895413e-06, "loss": 0.4694, "step": 2378 }, { "epoch": 4.1810193321616875, "grad_norm": 0.14615649518641033, "learning_rate": 6.3643418483630845e-06, "loss": 0.4723, "step": 2379 }, { "epoch": 4.182776801405975, "grad_norm": 0.11821181290024323, "learning_rate": 6.3378009025240675e-06, "loss": 0.471, "step": 2380 }, { "epoch": 4.184534270650263, "grad_norm": 0.11158554160152705, "learning_rate": 6.311310651348544e-06, "loss": 0.4719, "step": 2381 }, { "epoch": 4.186291739894552, "grad_norm": 0.12165300930270391, "learning_rate": 6.284871134730353e-06, "loss": 0.4873, "step": 2382 }, { "epoch": 4.18804920913884, "grad_norm": 0.12933890608542736, "learning_rate": 6.258482392486915e-06, "loss": 0.479, "step": 2383 }, { "epoch": 4.189806678383128, "grad_norm": 0.10738551790329887, "learning_rate": 6.232144464359229e-06, "loss": 0.4768, "step": 2384 }, { "epoch": 4.191564147627417, "grad_norm": 0.11892509307202402, "learning_rate": 6.205857390011716e-06, "loss": 0.4772, "step": 2385 }, { "epoch": 4.193321616871705, "grad_norm": 0.13101959295661952, "learning_rate": 6.179621209032247e-06, "loss": 0.466, "step": 2386 }, { "epoch": 4.195079086115993, "grad_norm": 0.10069183336109241, "learning_rate": 6.153435960932026e-06, "loss": 0.4681, "step": 2387 }, { "epoch": 4.1968365553602816, "grad_norm": 0.1152635001851318, "learning_rate": 6.12730168514557e-06, "loss": 0.4749, "step": 2388 }, { "epoch": 4.198594024604569, "grad_norm": 0.12287261666308742, "learning_rate": 6.1012184210306236e-06, "loss": 0.4643, "step": 2389 }, { "epoch": 4.200351493848857, "grad_norm": 0.12055662645805292, "learning_rate": 6.075186207868116e-06, "loss": 0.4763, "step": 2390 }, { "epoch": 4.202108963093146, "grad_norm": 0.11249392489155229, "learning_rate": 6.049205084862082e-06, "loss": 0.4738, "step": 2391 }, { "epoch": 4.203866432337434, "grad_norm": 0.13379159002378496, "learning_rate": 6.023275091139629e-06, "loss": 0.4685, "step": 2392 }, { "epoch": 4.205623901581722, "grad_norm": 0.12295334224734122, "learning_rate": 5.997396265750844e-06, "loss": 0.481, "step": 2393 }, { "epoch": 4.207381370826011, "grad_norm": 0.10853622992361496, "learning_rate": 5.971568647668795e-06, "loss": 0.486, "step": 2394 }, { "epoch": 4.209138840070299, "grad_norm": 0.1364900488033456, "learning_rate": 5.945792275789392e-06, "loss": 0.4728, "step": 2395 }, { "epoch": 4.210896309314587, "grad_norm": 0.12590847394581606, "learning_rate": 5.920067188931398e-06, "loss": 0.4774, "step": 2396 }, { "epoch": 4.212653778558876, "grad_norm": 0.1210214619807724, "learning_rate": 5.894393425836314e-06, "loss": 0.4852, "step": 2397 }, { "epoch": 4.214411247803163, "grad_norm": 0.10928578673874867, "learning_rate": 5.868771025168371e-06, "loss": 0.4746, "step": 2398 }, { "epoch": 4.216168717047451, "grad_norm": 0.12450212647975341, "learning_rate": 5.843200025514439e-06, "loss": 0.4681, "step": 2399 }, { "epoch": 4.21792618629174, "grad_norm": 0.12393628497307128, "learning_rate": 5.817680465383984e-06, "loss": 0.4867, "step": 2400 }, { "epoch": 4.219683655536028, "grad_norm": 0.13121085512150238, "learning_rate": 5.792212383208999e-06, "loss": 0.48, "step": 2401 }, { "epoch": 4.221441124780316, "grad_norm": 0.14924632920651051, "learning_rate": 5.766795817343962e-06, "loss": 0.4735, "step": 2402 }, { "epoch": 4.223198594024605, "grad_norm": 0.1184295403558219, "learning_rate": 5.741430806065742e-06, "loss": 0.4716, "step": 2403 }, { "epoch": 4.224956063268893, "grad_norm": 0.10327629545351014, "learning_rate": 5.716117387573614e-06, "loss": 0.4858, "step": 2404 }, { "epoch": 4.226713532513181, "grad_norm": 0.14497818787726297, "learning_rate": 5.690855599989121e-06, "loss": 0.4751, "step": 2405 }, { "epoch": 4.22847100175747, "grad_norm": 0.12898017013441837, "learning_rate": 5.66564548135605e-06, "loss": 0.4795, "step": 2406 }, { "epoch": 4.230228471001757, "grad_norm": 0.11573652141865091, "learning_rate": 5.64048706964039e-06, "loss": 0.4827, "step": 2407 }, { "epoch": 4.231985940246045, "grad_norm": 0.14286448591594547, "learning_rate": 5.615380402730246e-06, "loss": 0.4773, "step": 2408 }, { "epoch": 4.233743409490334, "grad_norm": 0.13827100376897228, "learning_rate": 5.5903255184358265e-06, "loss": 0.4778, "step": 2409 }, { "epoch": 4.235500878734622, "grad_norm": 0.113701915966414, "learning_rate": 5.565322454489317e-06, "loss": 0.4742, "step": 2410 }, { "epoch": 4.23725834797891, "grad_norm": 0.11552630660158722, "learning_rate": 5.540371248544878e-06, "loss": 0.4838, "step": 2411 }, { "epoch": 4.239015817223199, "grad_norm": 0.16852338568366895, "learning_rate": 5.51547193817858e-06, "loss": 0.4755, "step": 2412 }, { "epoch": 4.240773286467487, "grad_norm": 0.12567170329922261, "learning_rate": 5.490624560888327e-06, "loss": 0.4935, "step": 2413 }, { "epoch": 4.242530755711775, "grad_norm": 0.1312366663111067, "learning_rate": 5.465829154093838e-06, "loss": 0.4794, "step": 2414 }, { "epoch": 4.244288224956064, "grad_norm": 0.11978670478385176, "learning_rate": 5.441085755136533e-06, "loss": 0.4868, "step": 2415 }, { "epoch": 4.246045694200351, "grad_norm": 0.11119728607683435, "learning_rate": 5.4163944012795275e-06, "loss": 0.471, "step": 2416 }, { "epoch": 4.247803163444639, "grad_norm": 0.1676600631053803, "learning_rate": 5.391755129707559e-06, "loss": 0.4871, "step": 2417 }, { "epoch": 4.249560632688928, "grad_norm": 0.11445893276753037, "learning_rate": 5.367167977526904e-06, "loss": 0.4655, "step": 2418 }, { "epoch": 4.251318101933216, "grad_norm": 0.11493703791665373, "learning_rate": 5.342632981765401e-06, "loss": 0.4749, "step": 2419 }, { "epoch": 4.253075571177504, "grad_norm": 0.1459666154736967, "learning_rate": 5.3181501793722904e-06, "loss": 0.4789, "step": 2420 }, { "epoch": 4.254833040421793, "grad_norm": 0.10911237283283072, "learning_rate": 5.293719607218246e-06, "loss": 0.4803, "step": 2421 }, { "epoch": 4.256590509666081, "grad_norm": 0.17039143894755737, "learning_rate": 5.269341302095248e-06, "loss": 0.4773, "step": 2422 }, { "epoch": 4.258347978910369, "grad_norm": 0.1617999952657475, "learning_rate": 5.2450153007165846e-06, "loss": 0.4821, "step": 2423 }, { "epoch": 4.260105448154658, "grad_norm": 0.12502196145966307, "learning_rate": 5.220741639716789e-06, "loss": 0.4742, "step": 2424 }, { "epoch": 4.261862917398945, "grad_norm": 0.1533121967228406, "learning_rate": 5.196520355651547e-06, "loss": 0.478, "step": 2425 }, { "epoch": 4.263620386643233, "grad_norm": 0.1287598865435371, "learning_rate": 5.172351484997676e-06, "loss": 0.4786, "step": 2426 }, { "epoch": 4.265377855887522, "grad_norm": 0.10724424896068108, "learning_rate": 5.14823506415306e-06, "loss": 0.4842, "step": 2427 }, { "epoch": 4.26713532513181, "grad_norm": 0.18116263667107063, "learning_rate": 5.124171129436582e-06, "loss": 0.476, "step": 2428 }, { "epoch": 4.268892794376098, "grad_norm": 0.16310583482222363, "learning_rate": 5.100159717088114e-06, "loss": 0.4878, "step": 2429 }, { "epoch": 4.270650263620387, "grad_norm": 0.11837474079230441, "learning_rate": 5.076200863268397e-06, "loss": 0.4745, "step": 2430 }, { "epoch": 4.272407732864675, "grad_norm": 0.14070170603426246, "learning_rate": 5.052294604059027e-06, "loss": 0.4746, "step": 2431 }, { "epoch": 4.274165202108963, "grad_norm": 0.13003889960742473, "learning_rate": 5.02844097546241e-06, "loss": 0.4735, "step": 2432 }, { "epoch": 4.275922671353252, "grad_norm": 0.10363072286068417, "learning_rate": 5.004640013401671e-06, "loss": 0.4759, "step": 2433 }, { "epoch": 4.277680140597539, "grad_norm": 0.18060701363588635, "learning_rate": 4.9808917537206295e-06, "loss": 0.4644, "step": 2434 }, { "epoch": 4.279437609841827, "grad_norm": 0.1584946754669663, "learning_rate": 4.9571962321837405e-06, "loss": 0.4664, "step": 2435 }, { "epoch": 4.281195079086116, "grad_norm": 0.12598422803718792, "learning_rate": 4.933553484476026e-06, "loss": 0.4725, "step": 2436 }, { "epoch": 4.282952548330404, "grad_norm": 0.15687025317964912, "learning_rate": 4.909963546203038e-06, "loss": 0.4865, "step": 2437 }, { "epoch": 4.284710017574692, "grad_norm": 0.13050328048281593, "learning_rate": 4.886426452890787e-06, "loss": 0.4837, "step": 2438 }, { "epoch": 4.286467486818981, "grad_norm": 0.11551114838959345, "learning_rate": 4.8629422399857304e-06, "loss": 0.4809, "step": 2439 }, { "epoch": 4.288224956063269, "grad_norm": 0.16925505604318608, "learning_rate": 4.83951094285466e-06, "loss": 0.4683, "step": 2440 }, { "epoch": 4.289982425307557, "grad_norm": 0.16651281522601563, "learning_rate": 4.816132596784684e-06, "loss": 0.4677, "step": 2441 }, { "epoch": 4.291739894551846, "grad_norm": 0.12304385567266805, "learning_rate": 4.79280723698317e-06, "loss": 0.4792, "step": 2442 }, { "epoch": 4.293497363796133, "grad_norm": 0.15420566041903838, "learning_rate": 4.76953489857769e-06, "loss": 0.4803, "step": 2443 }, { "epoch": 4.295254833040421, "grad_norm": 0.11825596874872828, "learning_rate": 4.7463156166159595e-06, "loss": 0.4882, "step": 2444 }, { "epoch": 4.29701230228471, "grad_norm": 0.13506254616069893, "learning_rate": 4.723149426065803e-06, "loss": 0.477, "step": 2445 }, { "epoch": 4.298769771528998, "grad_norm": 0.13610203948360655, "learning_rate": 4.700036361815081e-06, "loss": 0.4795, "step": 2446 }, { "epoch": 4.300527240773286, "grad_norm": 0.10780487171451766, "learning_rate": 4.676976458671654e-06, "loss": 0.4831, "step": 2447 }, { "epoch": 4.302284710017575, "grad_norm": 0.1203522779339123, "learning_rate": 4.65396975136331e-06, "loss": 0.4604, "step": 2448 }, { "epoch": 4.304042179261863, "grad_norm": 0.11401597191066483, "learning_rate": 4.631016274537752e-06, "loss": 0.483, "step": 2449 }, { "epoch": 4.305799648506151, "grad_norm": 0.11916930304839775, "learning_rate": 4.608116062762489e-06, "loss": 0.4633, "step": 2450 }, { "epoch": 4.30755711775044, "grad_norm": 0.15425489648147625, "learning_rate": 4.585269150524832e-06, "loss": 0.4749, "step": 2451 }, { "epoch": 4.309314586994727, "grad_norm": 0.14245133404495625, "learning_rate": 4.562475572231808e-06, "loss": 0.4764, "step": 2452 }, { "epoch": 4.3110720562390155, "grad_norm": 0.12217382111378874, "learning_rate": 4.539735362210147e-06, "loss": 0.4729, "step": 2453 }, { "epoch": 4.312829525483304, "grad_norm": 0.1281202455710563, "learning_rate": 4.517048554706182e-06, "loss": 0.4758, "step": 2454 }, { "epoch": 4.314586994727592, "grad_norm": 0.15812877588108648, "learning_rate": 4.494415183885848e-06, "loss": 0.4772, "step": 2455 }, { "epoch": 4.31634446397188, "grad_norm": 0.14154648889709048, "learning_rate": 4.471835283834583e-06, "loss": 0.4895, "step": 2456 }, { "epoch": 4.318101933216169, "grad_norm": 0.19198824889224214, "learning_rate": 4.449308888557307e-06, "loss": 0.4699, "step": 2457 }, { "epoch": 4.319859402460457, "grad_norm": 0.14823775604259037, "learning_rate": 4.426836031978363e-06, "loss": 0.4646, "step": 2458 }, { "epoch": 4.321616871704745, "grad_norm": 0.1187066367990641, "learning_rate": 4.40441674794148e-06, "loss": 0.4687, "step": 2459 }, { "epoch": 4.323374340949034, "grad_norm": 0.13802613901281355, "learning_rate": 4.382051070209685e-06, "loss": 0.4693, "step": 2460 }, { "epoch": 4.325131810193321, "grad_norm": 0.11888915347555443, "learning_rate": 4.359739032465289e-06, "loss": 0.4754, "step": 2461 }, { "epoch": 4.3268892794376095, "grad_norm": 0.1256480214942587, "learning_rate": 4.337480668309813e-06, "loss": 0.4777, "step": 2462 }, { "epoch": 4.328646748681898, "grad_norm": 0.14403425783638318, "learning_rate": 4.315276011263944e-06, "loss": 0.479, "step": 2463 }, { "epoch": 4.330404217926186, "grad_norm": 0.14297493595364938, "learning_rate": 4.293125094767519e-06, "loss": 0.4748, "step": 2464 }, { "epoch": 4.3321616871704745, "grad_norm": 0.10688837389860426, "learning_rate": 4.2710279521793914e-06, "loss": 0.4682, "step": 2465 }, { "epoch": 4.333919156414763, "grad_norm": 0.12087274047326955, "learning_rate": 4.248984616777474e-06, "loss": 0.4799, "step": 2466 }, { "epoch": 4.335676625659051, "grad_norm": 0.12214935404171759, "learning_rate": 4.226995121758624e-06, "loss": 0.4755, "step": 2467 }, { "epoch": 4.337434094903339, "grad_norm": 0.11588257041489199, "learning_rate": 4.2050595002386175e-06, "loss": 0.4817, "step": 2468 }, { "epoch": 4.339191564147628, "grad_norm": 0.12270608091801564, "learning_rate": 4.183177785252124e-06, "loss": 0.4622, "step": 2469 }, { "epoch": 4.340949033391915, "grad_norm": 0.11585770046856178, "learning_rate": 4.161350009752596e-06, "loss": 0.4788, "step": 2470 }, { "epoch": 4.3427065026362035, "grad_norm": 0.11029327116926607, "learning_rate": 4.139576206612272e-06, "loss": 0.4753, "step": 2471 }, { "epoch": 4.344463971880492, "grad_norm": 0.1278542405890111, "learning_rate": 4.1178564086221095e-06, "loss": 0.4857, "step": 2472 }, { "epoch": 4.34622144112478, "grad_norm": 0.11603747894178841, "learning_rate": 4.09619064849172e-06, "loss": 0.4657, "step": 2473 }, { "epoch": 4.3479789103690685, "grad_norm": 0.11024278470815649, "learning_rate": 4.074578958849365e-06, "loss": 0.4795, "step": 2474 }, { "epoch": 4.349736379613357, "grad_norm": 0.11669333020051328, "learning_rate": 4.053021372241843e-06, "loss": 0.4806, "step": 2475 }, { "epoch": 4.351493848857645, "grad_norm": 0.11535267292091808, "learning_rate": 4.031517921134511e-06, "loss": 0.4699, "step": 2476 }, { "epoch": 4.353251318101933, "grad_norm": 0.12232709185005676, "learning_rate": 4.010068637911149e-06, "loss": 0.461, "step": 2477 }, { "epoch": 4.355008787346222, "grad_norm": 0.13238921984109384, "learning_rate": 3.988673554874001e-06, "loss": 0.4769, "step": 2478 }, { "epoch": 4.356766256590509, "grad_norm": 0.13237425942244252, "learning_rate": 3.9673327042436805e-06, "loss": 0.4746, "step": 2479 }, { "epoch": 4.3585237258347975, "grad_norm": 0.11981673190077721, "learning_rate": 3.946046118159123e-06, "loss": 0.4679, "step": 2480 }, { "epoch": 4.360281195079086, "grad_norm": 0.11219510854247149, "learning_rate": 3.9248138286775364e-06, "loss": 0.4873, "step": 2481 }, { "epoch": 4.362038664323374, "grad_norm": 0.1315045747277651, "learning_rate": 3.903635867774371e-06, "loss": 0.4795, "step": 2482 }, { "epoch": 4.3637961335676625, "grad_norm": 0.12834601313908245, "learning_rate": 3.882512267343246e-06, "loss": 0.4655, "step": 2483 }, { "epoch": 4.365553602811951, "grad_norm": 0.13053051999188256, "learning_rate": 3.861443059195931e-06, "loss": 0.4797, "step": 2484 }, { "epoch": 4.367311072056239, "grad_norm": 0.12850664957947938, "learning_rate": 3.840428275062267e-06, "loss": 0.4764, "step": 2485 }, { "epoch": 4.3690685413005275, "grad_norm": 0.11592182036453903, "learning_rate": 3.819467946590139e-06, "loss": 0.4736, "step": 2486 }, { "epoch": 4.370826010544816, "grad_norm": 0.1134703939461087, "learning_rate": 3.7985621053454247e-06, "loss": 0.4716, "step": 2487 }, { "epoch": 4.372583479789103, "grad_norm": 0.12350514906845948, "learning_rate": 3.7777107828119452e-06, "loss": 0.484, "step": 2488 }, { "epoch": 4.3743409490333915, "grad_norm": 0.11128693962489009, "learning_rate": 3.7569140103914124e-06, "loss": 0.4713, "step": 2489 }, { "epoch": 4.37609841827768, "grad_norm": 0.13481050706209785, "learning_rate": 3.736171819403387e-06, "loss": 0.4767, "step": 2490 }, { "epoch": 4.377855887521968, "grad_norm": 0.14827872600166309, "learning_rate": 3.715484241085241e-06, "loss": 0.4748, "step": 2491 }, { "epoch": 4.3796133567662565, "grad_norm": 0.12823623760012132, "learning_rate": 3.694851306592089e-06, "loss": 0.469, "step": 2492 }, { "epoch": 4.381370826010545, "grad_norm": 0.11431966809023439, "learning_rate": 3.674273046996746e-06, "loss": 0.472, "step": 2493 }, { "epoch": 4.383128295254833, "grad_norm": 0.12507057243992728, "learning_rate": 3.65374949328972e-06, "loss": 0.4746, "step": 2494 }, { "epoch": 4.3848857644991215, "grad_norm": 0.14427744592469377, "learning_rate": 3.633280676379105e-06, "loss": 0.4663, "step": 2495 }, { "epoch": 4.38664323374341, "grad_norm": 0.13579005789227355, "learning_rate": 3.6128666270905676e-06, "loss": 0.4878, "step": 2496 }, { "epoch": 4.388400702987697, "grad_norm": 0.11276891807720503, "learning_rate": 3.5925073761672936e-06, "loss": 0.4745, "step": 2497 }, { "epoch": 4.390158172231986, "grad_norm": 0.11422827226346834, "learning_rate": 3.572202954269952e-06, "loss": 0.4717, "step": 2498 }, { "epoch": 4.391915641476274, "grad_norm": 0.11730448731666311, "learning_rate": 3.5519533919766303e-06, "loss": 0.4814, "step": 2499 }, { "epoch": 4.393673110720562, "grad_norm": 0.13669102707642358, "learning_rate": 3.5317587197828097e-06, "loss": 0.4702, "step": 2500 }, { "epoch": 4.3954305799648505, "grad_norm": 0.10323879886864211, "learning_rate": 3.5116189681012956e-06, "loss": 0.4674, "step": 2501 }, { "epoch": 4.397188049209139, "grad_norm": 0.10558694568434875, "learning_rate": 3.491534167262196e-06, "loss": 0.4737, "step": 2502 }, { "epoch": 4.398945518453427, "grad_norm": 0.13716017793431087, "learning_rate": 3.471504347512844e-06, "loss": 0.4741, "step": 2503 }, { "epoch": 4.4007029876977155, "grad_norm": 0.1307084133993296, "learning_rate": 3.4515295390178085e-06, "loss": 0.4599, "step": 2504 }, { "epoch": 4.402460456942004, "grad_norm": 0.11945205366842844, "learning_rate": 3.4316097718587727e-06, "loss": 0.474, "step": 2505 }, { "epoch": 4.404217926186292, "grad_norm": 0.11120427161627375, "learning_rate": 3.4117450760345584e-06, "loss": 0.4792, "step": 2506 }, { "epoch": 4.40597539543058, "grad_norm": 0.10873253199760173, "learning_rate": 3.391935481461026e-06, "loss": 0.4724, "step": 2507 }, { "epoch": 4.407732864674868, "grad_norm": 0.13111699297193766, "learning_rate": 3.372181017971077e-06, "loss": 0.4708, "step": 2508 }, { "epoch": 4.409490333919156, "grad_norm": 0.10572214602020835, "learning_rate": 3.352481715314566e-06, "loss": 0.4767, "step": 2509 }, { "epoch": 4.411247803163445, "grad_norm": 0.11908011054336685, "learning_rate": 3.3328376031582967e-06, "loss": 0.4715, "step": 2510 }, { "epoch": 4.413005272407733, "grad_norm": 0.09482435058579816, "learning_rate": 3.3132487110859456e-06, "loss": 0.4818, "step": 2511 }, { "epoch": 4.414762741652021, "grad_norm": 0.10915586569203503, "learning_rate": 3.2937150685980226e-06, "loss": 0.4766, "step": 2512 }, { "epoch": 4.4165202108963095, "grad_norm": 0.11232873870739767, "learning_rate": 3.274236705111844e-06, "loss": 0.481, "step": 2513 }, { "epoch": 4.418277680140598, "grad_norm": 0.1044554364075346, "learning_rate": 3.254813649961479e-06, "loss": 0.4737, "step": 2514 }, { "epoch": 4.420035149384886, "grad_norm": 0.13521977558490222, "learning_rate": 3.2354459323976893e-06, "loss": 0.4676, "step": 2515 }, { "epoch": 4.421792618629174, "grad_norm": 0.12803884691749512, "learning_rate": 3.216133581587917e-06, "loss": 0.4722, "step": 2516 }, { "epoch": 4.423550087873462, "grad_norm": 0.11946753924240346, "learning_rate": 3.1968766266162075e-06, "loss": 0.4891, "step": 2517 }, { "epoch": 4.42530755711775, "grad_norm": 0.11638446211739926, "learning_rate": 3.177675096483177e-06, "loss": 0.4753, "step": 2518 }, { "epoch": 4.427065026362039, "grad_norm": 0.11087234308392022, "learning_rate": 3.1585290201060046e-06, "loss": 0.4745, "step": 2519 }, { "epoch": 4.428822495606327, "grad_norm": 0.09525592842748508, "learning_rate": 3.1394384263183288e-06, "loss": 0.4688, "step": 2520 }, { "epoch": 4.430579964850615, "grad_norm": 0.1350532176757668, "learning_rate": 3.120403343870226e-06, "loss": 0.4753, "step": 2521 }, { "epoch": 4.4323374340949035, "grad_norm": 0.14064460893160172, "learning_rate": 3.1014238014281937e-06, "loss": 0.4719, "step": 2522 }, { "epoch": 4.434094903339192, "grad_norm": 0.09720135805865704, "learning_rate": 3.082499827575074e-06, "loss": 0.4652, "step": 2523 }, { "epoch": 4.43585237258348, "grad_norm": 0.10920269169959194, "learning_rate": 3.0636314508100474e-06, "loss": 0.4793, "step": 2524 }, { "epoch": 4.437609841827768, "grad_norm": 0.10638481008145269, "learning_rate": 3.0448186995485307e-06, "loss": 0.4673, "step": 2525 }, { "epoch": 4.439367311072056, "grad_norm": 0.10248077900146271, "learning_rate": 3.026061602122199e-06, "loss": 0.4758, "step": 2526 }, { "epoch": 4.441124780316344, "grad_norm": 0.14779754780981794, "learning_rate": 3.0073601867788917e-06, "loss": 0.4649, "step": 2527 }, { "epoch": 4.442882249560633, "grad_norm": 0.09510765696037683, "learning_rate": 2.988714481682604e-06, "loss": 0.4763, "step": 2528 }, { "epoch": 4.444639718804921, "grad_norm": 0.10828146059531848, "learning_rate": 2.9701245149134394e-06, "loss": 0.4651, "step": 2529 }, { "epoch": 4.446397188049209, "grad_norm": 0.1025667652526198, "learning_rate": 2.951590314467545e-06, "loss": 0.4641, "step": 2530 }, { "epoch": 4.448154657293498, "grad_norm": 0.10133466767486621, "learning_rate": 2.9331119082571003e-06, "loss": 0.4686, "step": 2531 }, { "epoch": 4.449912126537786, "grad_norm": 0.09924308406175934, "learning_rate": 2.91468932411024e-06, "loss": 0.4797, "step": 2532 }, { "epoch": 4.451669595782074, "grad_norm": 0.13675010898312204, "learning_rate": 2.896322589771039e-06, "loss": 0.4819, "step": 2533 }, { "epoch": 4.453427065026362, "grad_norm": 0.10259364506877604, "learning_rate": 2.8780117328994817e-06, "loss": 0.4612, "step": 2534 }, { "epoch": 4.45518453427065, "grad_norm": 0.10788503912299131, "learning_rate": 2.859756781071381e-06, "loss": 0.4699, "step": 2535 }, { "epoch": 4.456942003514938, "grad_norm": 0.09291932307277351, "learning_rate": 2.841557761778364e-06, "loss": 0.4658, "step": 2536 }, { "epoch": 4.458699472759227, "grad_norm": 0.09762159779714302, "learning_rate": 2.8234147024278267e-06, "loss": 0.4782, "step": 2537 }, { "epoch": 4.460456942003515, "grad_norm": 0.14417791727330806, "learning_rate": 2.805327630342878e-06, "loss": 0.4744, "step": 2538 }, { "epoch": 4.462214411247803, "grad_norm": 0.09153715863844974, "learning_rate": 2.7872965727623413e-06, "loss": 0.4778, "step": 2539 }, { "epoch": 4.463971880492092, "grad_norm": 0.10440093173883881, "learning_rate": 2.7693215568406516e-06, "loss": 0.4747, "step": 2540 }, { "epoch": 4.46572934973638, "grad_norm": 0.09836297226632226, "learning_rate": 2.7514026096478575e-06, "loss": 0.4736, "step": 2541 }, { "epoch": 4.467486818980668, "grad_norm": 0.08935440620647878, "learning_rate": 2.7335397581695724e-06, "loss": 0.4625, "step": 2542 }, { "epoch": 4.469244288224956, "grad_norm": 0.12686348537914513, "learning_rate": 2.715733029306931e-06, "loss": 0.4771, "step": 2543 }, { "epoch": 4.471001757469244, "grad_norm": 0.0929085784515153, "learning_rate": 2.697982449876535e-06, "loss": 0.4664, "step": 2544 }, { "epoch": 4.472759226713532, "grad_norm": 0.105249611360165, "learning_rate": 2.6802880466104465e-06, "loss": 0.4905, "step": 2545 }, { "epoch": 4.474516695957821, "grad_norm": 0.10053758667457215, "learning_rate": 2.6626498461561136e-06, "loss": 0.4794, "step": 2546 }, { "epoch": 4.476274165202109, "grad_norm": 0.12655454758362863, "learning_rate": 2.645067875076346e-06, "loss": 0.4712, "step": 2547 }, { "epoch": 4.478031634446397, "grad_norm": 0.1008740560167108, "learning_rate": 2.6275421598492657e-06, "loss": 0.4633, "step": 2548 }, { "epoch": 4.479789103690686, "grad_norm": 0.1142883724990469, "learning_rate": 2.6100727268683025e-06, "loss": 0.4725, "step": 2549 }, { "epoch": 4.481546572934974, "grad_norm": 0.09754092834737228, "learning_rate": 2.5926596024420957e-06, "loss": 0.4754, "step": 2550 }, { "epoch": 4.483304042179262, "grad_norm": 0.0975129205334943, "learning_rate": 2.5753028127945e-06, "loss": 0.4688, "step": 2551 }, { "epoch": 4.48506151142355, "grad_norm": 0.1305557989474458, "learning_rate": 2.558002384064522e-06, "loss": 0.4719, "step": 2552 }, { "epoch": 4.486818980667838, "grad_norm": 0.09554315539713312, "learning_rate": 2.5407583423062932e-06, "loss": 0.4932, "step": 2553 }, { "epoch": 4.488576449912126, "grad_norm": 0.09124931115581991, "learning_rate": 2.5235707134890364e-06, "loss": 0.4708, "step": 2554 }, { "epoch": 4.490333919156415, "grad_norm": 0.09631419592481577, "learning_rate": 2.506439523497006e-06, "loss": 0.4738, "step": 2555 }, { "epoch": 4.492091388400703, "grad_norm": 0.09565524179009476, "learning_rate": 2.4893647981294633e-06, "loss": 0.4779, "step": 2556 }, { "epoch": 4.493848857644991, "grad_norm": 0.09021063626896801, "learning_rate": 2.472346563100638e-06, "loss": 0.4906, "step": 2557 }, { "epoch": 4.49560632688928, "grad_norm": 0.09003358060360184, "learning_rate": 2.4553848440396787e-06, "loss": 0.4744, "step": 2558 }, { "epoch": 4.497363796133568, "grad_norm": 0.14656883790244565, "learning_rate": 2.4384796664906406e-06, "loss": 0.486, "step": 2559 }, { "epoch": 4.499121265377856, "grad_norm": 0.09020797884096664, "learning_rate": 2.4216310559124035e-06, "loss": 0.4781, "step": 2560 }, { "epoch": 4.500878734622145, "grad_norm": 0.08989417508018989, "learning_rate": 2.404839037678679e-06, "loss": 0.4645, "step": 2561 }, { "epoch": 4.502636203866432, "grad_norm": 0.09951886985944082, "learning_rate": 2.3881036370779364e-06, "loss": 0.4697, "step": 2562 }, { "epoch": 4.50439367311072, "grad_norm": 0.0931308692203395, "learning_rate": 2.3714248793133886e-06, "loss": 0.4722, "step": 2563 }, { "epoch": 4.506151142355009, "grad_norm": 0.09860232479171328, "learning_rate": 2.354802789502948e-06, "loss": 0.4764, "step": 2564 }, { "epoch": 4.507908611599297, "grad_norm": 0.13743830534054277, "learning_rate": 2.3382373926791722e-06, "loss": 0.4737, "step": 2565 }, { "epoch": 4.509666080843585, "grad_norm": 0.0965360819728103, "learning_rate": 2.3217287137892573e-06, "loss": 0.4736, "step": 2566 }, { "epoch": 4.511423550087874, "grad_norm": 0.08917646289704878, "learning_rate": 2.3052767776949737e-06, "loss": 0.4755, "step": 2567 }, { "epoch": 4.513181019332162, "grad_norm": 0.10011482371874779, "learning_rate": 2.2888816091726307e-06, "loss": 0.4761, "step": 2568 }, { "epoch": 4.514938488576449, "grad_norm": 0.09060551405265217, "learning_rate": 2.2725432329130693e-06, "loss": 0.4805, "step": 2569 }, { "epoch": 4.516695957820739, "grad_norm": 0.09686807890779883, "learning_rate": 2.2562616735215848e-06, "loss": 0.4616, "step": 2570 }, { "epoch": 4.518453427065026, "grad_norm": 0.14219675198825493, "learning_rate": 2.2400369555179103e-06, "loss": 0.4636, "step": 2571 }, { "epoch": 4.520210896309314, "grad_norm": 0.09117681095192648, "learning_rate": 2.2238691033361804e-06, "loss": 0.4784, "step": 2572 }, { "epoch": 4.521968365553603, "grad_norm": 0.09483836081877949, "learning_rate": 2.207758141324878e-06, "loss": 0.4796, "step": 2573 }, { "epoch": 4.523725834797891, "grad_norm": 0.10240353982712314, "learning_rate": 2.1917040937468315e-06, "loss": 0.4736, "step": 2574 }, { "epoch": 4.525483304042179, "grad_norm": 0.1004398414999113, "learning_rate": 2.1757069847791535e-06, "loss": 0.4796, "step": 2575 }, { "epoch": 4.527240773286468, "grad_norm": 0.09256148248567904, "learning_rate": 2.159766838513182e-06, "loss": 0.4671, "step": 2576 }, { "epoch": 4.528998242530756, "grad_norm": 0.08810623398358088, "learning_rate": 2.143883678954497e-06, "loss": 0.4725, "step": 2577 }, { "epoch": 4.530755711775043, "grad_norm": 0.09137519566714504, "learning_rate": 2.1280575300228444e-06, "loss": 0.4827, "step": 2578 }, { "epoch": 4.532513181019333, "grad_norm": 0.09722559085938932, "learning_rate": 2.112288415552133e-06, "loss": 0.4722, "step": 2579 }, { "epoch": 4.53427065026362, "grad_norm": 0.09200353507600288, "learning_rate": 2.0965763592903564e-06, "loss": 0.4717, "step": 2580 }, { "epoch": 4.536028119507908, "grad_norm": 0.10155560637937568, "learning_rate": 2.080921384899588e-06, "loss": 0.4733, "step": 2581 }, { "epoch": 4.537785588752197, "grad_norm": 0.12197229298346372, "learning_rate": 2.065323515955933e-06, "loss": 0.4721, "step": 2582 }, { "epoch": 4.539543057996485, "grad_norm": 0.09377880949892929, "learning_rate": 2.0497827759494936e-06, "loss": 0.4782, "step": 2583 }, { "epoch": 4.541300527240773, "grad_norm": 0.0904348978113528, "learning_rate": 2.034299188284363e-06, "loss": 0.4787, "step": 2584 }, { "epoch": 4.543057996485062, "grad_norm": 0.092486318109831, "learning_rate": 2.018872776278533e-06, "loss": 0.4824, "step": 2585 }, { "epoch": 4.54481546572935, "grad_norm": 0.08715523861077847, "learning_rate": 2.0035035631639e-06, "loss": 0.473, "step": 2586 }, { "epoch": 4.546572934973638, "grad_norm": 0.09516810398512435, "learning_rate": 1.988191572086229e-06, "loss": 0.4669, "step": 2587 }, { "epoch": 4.548330404217927, "grad_norm": 0.08655183196758495, "learning_rate": 1.9729368261050873e-06, "loss": 0.4718, "step": 2588 }, { "epoch": 4.550087873462214, "grad_norm": 0.08827002623769377, "learning_rate": 1.957739348193859e-06, "loss": 0.4696, "step": 2589 }, { "epoch": 4.551845342706502, "grad_norm": 0.11847006027774645, "learning_rate": 1.9425991612396668e-06, "loss": 0.4852, "step": 2590 }, { "epoch": 4.553602811950791, "grad_norm": 0.14174404135400354, "learning_rate": 1.927516288043361e-06, "loss": 0.4702, "step": 2591 }, { "epoch": 4.555360281195079, "grad_norm": 0.08885369507470578, "learning_rate": 1.9124907513194736e-06, "loss": 0.4781, "step": 2592 }, { "epoch": 4.557117750439367, "grad_norm": 0.09608539538544204, "learning_rate": 1.897522573696189e-06, "loss": 0.4738, "step": 2593 }, { "epoch": 4.558875219683656, "grad_norm": 0.0966961445122232, "learning_rate": 1.8826117777153151e-06, "loss": 0.4742, "step": 2594 }, { "epoch": 4.560632688927944, "grad_norm": 0.10784214695642533, "learning_rate": 1.8677583858322457e-06, "loss": 0.4685, "step": 2595 }, { "epoch": 4.562390158172232, "grad_norm": 0.09016257127109381, "learning_rate": 1.852962420415918e-06, "loss": 0.4794, "step": 2596 }, { "epoch": 4.564147627416521, "grad_norm": 0.09526032742481386, "learning_rate": 1.8382239037487837e-06, "loss": 0.4734, "step": 2597 }, { "epoch": 4.565905096660808, "grad_norm": 0.1017927343987018, "learning_rate": 1.8235428580267855e-06, "loss": 0.4862, "step": 2598 }, { "epoch": 4.5676625659050965, "grad_norm": 0.09530749020069576, "learning_rate": 1.8089193053593135e-06, "loss": 0.474, "step": 2599 }, { "epoch": 4.569420035149385, "grad_norm": 0.15244351164101905, "learning_rate": 1.7943532677691734e-06, "loss": 0.4725, "step": 2600 }, { "epoch": 4.571177504393673, "grad_norm": 0.12619979960731084, "learning_rate": 1.7798447671925555e-06, "loss": 0.4694, "step": 2601 }, { "epoch": 4.572934973637961, "grad_norm": 0.11132734486885938, "learning_rate": 1.7653938254789959e-06, "loss": 0.4837, "step": 2602 }, { "epoch": 4.57469244288225, "grad_norm": 0.08645776060927424, "learning_rate": 1.7510004643913526e-06, "loss": 0.4788, "step": 2603 }, { "epoch": 4.576449912126538, "grad_norm": 0.09802332453993677, "learning_rate": 1.7366647056057707e-06, "loss": 0.4821, "step": 2604 }, { "epoch": 4.578207381370826, "grad_norm": 0.09897267854146719, "learning_rate": 1.7223865707116472e-06, "loss": 0.4735, "step": 2605 }, { "epoch": 4.579964850615115, "grad_norm": 0.09966254543959198, "learning_rate": 1.7081660812115864e-06, "loss": 0.4673, "step": 2606 }, { "epoch": 4.581722319859402, "grad_norm": 0.08871830588498256, "learning_rate": 1.6940032585213907e-06, "loss": 0.4671, "step": 2607 }, { "epoch": 4.5834797891036905, "grad_norm": 0.08959573708728479, "learning_rate": 1.6798981239700207e-06, "loss": 0.4656, "step": 2608 }, { "epoch": 4.585237258347979, "grad_norm": 0.0915005881372973, "learning_rate": 1.6658506987995471e-06, "loss": 0.467, "step": 2609 }, { "epoch": 4.586994727592267, "grad_norm": 0.09770626441237516, "learning_rate": 1.6518610041651405e-06, "loss": 0.4687, "step": 2610 }, { "epoch": 4.588752196836555, "grad_norm": 0.09295284502235703, "learning_rate": 1.6379290611350286e-06, "loss": 0.4797, "step": 2611 }, { "epoch": 4.590509666080844, "grad_norm": 0.09106909667318785, "learning_rate": 1.624054890690472e-06, "loss": 0.4749, "step": 2612 }, { "epoch": 4.592267135325132, "grad_norm": 0.09785398194558947, "learning_rate": 1.6102385137257036e-06, "loss": 0.4757, "step": 2613 }, { "epoch": 4.59402460456942, "grad_norm": 0.08946989491389615, "learning_rate": 1.5964799510479557e-06, "loss": 0.4782, "step": 2614 }, { "epoch": 4.595782073813709, "grad_norm": 0.11099639470417733, "learning_rate": 1.5827792233773687e-06, "loss": 0.486, "step": 2615 }, { "epoch": 4.597539543057996, "grad_norm": 0.09499224955784899, "learning_rate": 1.5691363513469894e-06, "loss": 0.4731, "step": 2616 }, { "epoch": 4.5992970123022845, "grad_norm": 0.10080500961902203, "learning_rate": 1.555551355502738e-06, "loss": 0.4695, "step": 2617 }, { "epoch": 4.601054481546573, "grad_norm": 0.09130171657419056, "learning_rate": 1.5420242563033694e-06, "loss": 0.4716, "step": 2618 }, { "epoch": 4.602811950790861, "grad_norm": 0.0947525345681029, "learning_rate": 1.5285550741204546e-06, "loss": 0.4687, "step": 2619 }, { "epoch": 4.6045694200351495, "grad_norm": 0.10145083051334608, "learning_rate": 1.5151438292383413e-06, "loss": 0.4795, "step": 2620 }, { "epoch": 4.606326889279438, "grad_norm": 0.08862940636802111, "learning_rate": 1.5017905418541134e-06, "loss": 0.4732, "step": 2621 }, { "epoch": 4.608084358523726, "grad_norm": 0.09815283091897485, "learning_rate": 1.4884952320775826e-06, "loss": 0.4678, "step": 2622 }, { "epoch": 4.609841827768014, "grad_norm": 0.09092743803935205, "learning_rate": 1.4752579199312477e-06, "loss": 0.4616, "step": 2623 }, { "epoch": 4.611599297012303, "grad_norm": 0.10603500270498353, "learning_rate": 1.4620786253502606e-06, "loss": 0.4772, "step": 2624 }, { "epoch": 4.61335676625659, "grad_norm": 0.08726487714918553, "learning_rate": 1.4489573681824066e-06, "loss": 0.4784, "step": 2625 }, { "epoch": 4.6151142355008785, "grad_norm": 0.08509229403320395, "learning_rate": 1.4358941681880522e-06, "loss": 0.4674, "step": 2626 }, { "epoch": 4.616871704745167, "grad_norm": 0.08322733720184514, "learning_rate": 1.42288904504015e-06, "loss": 0.4818, "step": 2627 }, { "epoch": 4.618629173989455, "grad_norm": 0.09018825320980457, "learning_rate": 1.4099420183241663e-06, "loss": 0.4764, "step": 2628 }, { "epoch": 4.6203866432337435, "grad_norm": 0.08418382088879445, "learning_rate": 1.3970531075381043e-06, "loss": 0.4694, "step": 2629 }, { "epoch": 4.622144112478032, "grad_norm": 0.09728525922761137, "learning_rate": 1.3842223320924286e-06, "loss": 0.4743, "step": 2630 }, { "epoch": 4.62390158172232, "grad_norm": 0.08475500340095703, "learning_rate": 1.3714497113100466e-06, "loss": 0.4742, "step": 2631 }, { "epoch": 4.6256590509666085, "grad_norm": 0.08817250021217013, "learning_rate": 1.3587352644263007e-06, "loss": 0.4695, "step": 2632 }, { "epoch": 4.627416520210897, "grad_norm": 0.08702899253167443, "learning_rate": 1.3460790105889098e-06, "loss": 0.4861, "step": 2633 }, { "epoch": 4.629173989455184, "grad_norm": 0.08008229347105458, "learning_rate": 1.3334809688579785e-06, "loss": 0.471, "step": 2634 }, { "epoch": 4.6309314586994725, "grad_norm": 0.08590556152469714, "learning_rate": 1.3209411582059172e-06, "loss": 0.463, "step": 2635 }, { "epoch": 4.632688927943761, "grad_norm": 0.08968006875631378, "learning_rate": 1.3084595975174598e-06, "loss": 0.4847, "step": 2636 }, { "epoch": 4.634446397188049, "grad_norm": 0.08517505471804333, "learning_rate": 1.2960363055896097e-06, "loss": 0.4769, "step": 2637 }, { "epoch": 4.6362038664323375, "grad_norm": 0.08528647290007285, "learning_rate": 1.2836713011316192e-06, "loss": 0.4763, "step": 2638 }, { "epoch": 4.637961335676626, "grad_norm": 0.08836128469326236, "learning_rate": 1.2713646027649617e-06, "loss": 0.4657, "step": 2639 }, { "epoch": 4.639718804920914, "grad_norm": 0.08970117979061178, "learning_rate": 1.2591162290233049e-06, "loss": 0.4791, "step": 2640 }, { "epoch": 4.6414762741652025, "grad_norm": 0.08328074043955241, "learning_rate": 1.2469261983524805e-06, "loss": 0.4631, "step": 2641 }, { "epoch": 4.643233743409491, "grad_norm": 0.08645984280095541, "learning_rate": 1.2347945291104524e-06, "loss": 0.4618, "step": 2642 }, { "epoch": 4.644991212653778, "grad_norm": 0.08841450088840817, "learning_rate": 1.2227212395672817e-06, "loss": 0.48, "step": 2643 }, { "epoch": 4.646748681898067, "grad_norm": 0.08758431525604754, "learning_rate": 1.210706347905144e-06, "loss": 0.4865, "step": 2644 }, { "epoch": 4.648506151142355, "grad_norm": 0.08552054161968478, "learning_rate": 1.1987498722182411e-06, "loss": 0.4794, "step": 2645 }, { "epoch": 4.650263620386643, "grad_norm": 0.08348561952062945, "learning_rate": 1.1868518305128096e-06, "loss": 0.4932, "step": 2646 }, { "epoch": 4.6520210896309315, "grad_norm": 0.08735616031579511, "learning_rate": 1.1750122407070852e-06, "loss": 0.4773, "step": 2647 }, { "epoch": 4.65377855887522, "grad_norm": 0.08980149631087651, "learning_rate": 1.1632311206312674e-06, "loss": 0.4681, "step": 2648 }, { "epoch": 4.655536028119508, "grad_norm": 0.0848466237460616, "learning_rate": 1.1515084880275286e-06, "loss": 0.4807, "step": 2649 }, { "epoch": 4.6572934973637965, "grad_norm": 0.11782059651949392, "learning_rate": 1.1398443605499288e-06, "loss": 0.4669, "step": 2650 }, { "epoch": 4.659050966608085, "grad_norm": 0.08558863292068446, "learning_rate": 1.1282387557644347e-06, "loss": 0.4701, "step": 2651 }, { "epoch": 4.660808435852372, "grad_norm": 0.0857900861965977, "learning_rate": 1.1166916911488835e-06, "loss": 0.4811, "step": 2652 }, { "epoch": 4.662565905096661, "grad_norm": 0.08564740526760911, "learning_rate": 1.1052031840929379e-06, "loss": 0.4691, "step": 2653 }, { "epoch": 4.664323374340949, "grad_norm": 0.08592183326035295, "learning_rate": 1.0937732518980826e-06, "loss": 0.468, "step": 2654 }, { "epoch": 4.666080843585237, "grad_norm": 0.09473579782351656, "learning_rate": 1.0824019117775975e-06, "loss": 0.4948, "step": 2655 }, { "epoch": 4.6678383128295255, "grad_norm": 0.09136963783470808, "learning_rate": 1.0710891808565038e-06, "loss": 0.4696, "step": 2656 }, { "epoch": 4.669595782073814, "grad_norm": 0.08877682894613446, "learning_rate": 1.0598350761715781e-06, "loss": 0.4779, "step": 2657 }, { "epoch": 4.671353251318102, "grad_norm": 0.0901679969726404, "learning_rate": 1.0486396146712896e-06, "loss": 0.4769, "step": 2658 }, { "epoch": 4.6731107205623905, "grad_norm": 0.08630446620636165, "learning_rate": 1.0375028132158138e-06, "loss": 0.4741, "step": 2659 }, { "epoch": 4.674868189806679, "grad_norm": 0.0855129988854287, "learning_rate": 1.0264246885769658e-06, "loss": 0.4816, "step": 2660 }, { "epoch": 4.676625659050966, "grad_norm": 0.08951268238006704, "learning_rate": 1.0154052574382e-06, "loss": 0.468, "step": 2661 }, { "epoch": 4.678383128295255, "grad_norm": 0.08313931095973245, "learning_rate": 1.0044445363945842e-06, "loss": 0.4733, "step": 2662 }, { "epoch": 4.680140597539543, "grad_norm": 0.08712251762568199, "learning_rate": 9.93542541952759e-07, "loss": 0.4677, "step": 2663 }, { "epoch": 4.681898066783831, "grad_norm": 0.08415101769517182, "learning_rate": 9.826992905309373e-07, "loss": 0.4751, "step": 2664 }, { "epoch": 4.68365553602812, "grad_norm": 0.08924786839984952, "learning_rate": 9.719147984588617e-07, "loss": 0.4769, "step": 2665 }, { "epoch": 4.685413005272408, "grad_norm": 0.08852848094325852, "learning_rate": 9.611890819777758e-07, "loss": 0.4782, "step": 2666 }, { "epoch": 4.687170474516696, "grad_norm": 0.08496981744590544, "learning_rate": 9.505221572404166e-07, "loss": 0.4688, "step": 2667 }, { "epoch": 4.6889279437609845, "grad_norm": 0.08203552134079375, "learning_rate": 9.399140403109785e-07, "loss": 0.4844, "step": 2668 }, { "epoch": 4.690685413005273, "grad_norm": 0.0865969811641692, "learning_rate": 9.293647471651046e-07, "loss": 0.4702, "step": 2669 }, { "epoch": 4.69244288224956, "grad_norm": 0.0874345347416591, "learning_rate": 9.188742936898243e-07, "loss": 0.475, "step": 2670 }, { "epoch": 4.694200351493849, "grad_norm": 0.08198937492036211, "learning_rate": 9.084426956835845e-07, "loss": 0.4729, "step": 2671 }, { "epoch": 4.695957820738137, "grad_norm": 0.08593265440648458, "learning_rate": 8.980699688561745e-07, "loss": 0.4766, "step": 2672 }, { "epoch": 4.697715289982425, "grad_norm": 0.0826172808123062, "learning_rate": 8.87756128828734e-07, "loss": 0.4695, "step": 2673 }, { "epoch": 4.699472759226714, "grad_norm": 0.08832755299993245, "learning_rate": 8.775011911337272e-07, "loss": 0.4768, "step": 2674 }, { "epoch": 4.701230228471002, "grad_norm": 0.08676637593421628, "learning_rate": 8.673051712148984e-07, "loss": 0.4707, "step": 2675 }, { "epoch": 4.70298769771529, "grad_norm": 0.08588705912829672, "learning_rate": 8.571680844272667e-07, "loss": 0.4785, "step": 2676 }, { "epoch": 4.704745166959579, "grad_norm": 0.08726926215915028, "learning_rate": 8.470899460371096e-07, "loss": 0.4824, "step": 2677 }, { "epoch": 4.706502636203867, "grad_norm": 0.08845365863203015, "learning_rate": 8.370707712219084e-07, "loss": 0.4738, "step": 2678 }, { "epoch": 4.708260105448154, "grad_norm": 0.08518310420066211, "learning_rate": 8.271105750703756e-07, "loss": 0.4707, "step": 2679 }, { "epoch": 4.710017574692443, "grad_norm": 0.08410878001982752, "learning_rate": 8.172093725823793e-07, "loss": 0.4802, "step": 2680 }, { "epoch": 4.711775043936731, "grad_norm": 0.08853496657128597, "learning_rate": 8.073671786689518e-07, "loss": 0.4762, "step": 2681 }, { "epoch": 4.713532513181019, "grad_norm": 0.08927506837949424, "learning_rate": 7.975840081522546e-07, "loss": 0.4792, "step": 2682 }, { "epoch": 4.715289982425308, "grad_norm": 0.08255585497160325, "learning_rate": 7.878598757655686e-07, "loss": 0.4732, "step": 2683 }, { "epoch": 4.717047451669596, "grad_norm": 0.08362975378687602, "learning_rate": 7.781947961532688e-07, "loss": 0.4757, "step": 2684 }, { "epoch": 4.718804920913884, "grad_norm": 0.08827501276854981, "learning_rate": 7.685887838707828e-07, "loss": 0.4699, "step": 2685 }, { "epoch": 4.720562390158173, "grad_norm": 0.08421918377484112, "learning_rate": 7.590418533845923e-07, "loss": 0.4739, "step": 2686 }, { "epoch": 4.722319859402461, "grad_norm": 0.08314923094008009, "learning_rate": 7.495540190722051e-07, "loss": 0.4747, "step": 2687 }, { "epoch": 4.724077328646748, "grad_norm": 0.08780158892203319, "learning_rate": 7.401252952221205e-07, "loss": 0.4672, "step": 2688 }, { "epoch": 4.725834797891037, "grad_norm": 0.08829735060636718, "learning_rate": 7.307556960338335e-07, "loss": 0.4823, "step": 2689 }, { "epoch": 4.727592267135325, "grad_norm": 0.08999923954349894, "learning_rate": 7.214452356177903e-07, "loss": 0.4661, "step": 2690 }, { "epoch": 4.729349736379613, "grad_norm": 0.08305795003308893, "learning_rate": 7.121939279953749e-07, "loss": 0.4823, "step": 2691 }, { "epoch": 4.731107205623902, "grad_norm": 0.0840958200982074, "learning_rate": 7.030017870988914e-07, "loss": 0.4862, "step": 2692 }, { "epoch": 4.73286467486819, "grad_norm": 0.08059367758137967, "learning_rate": 6.938688267715288e-07, "loss": 0.4689, "step": 2693 }, { "epoch": 4.734622144112478, "grad_norm": 0.08676065096895773, "learning_rate": 6.847950607673737e-07, "loss": 0.4718, "step": 2694 }, { "epoch": 4.736379613356767, "grad_norm": 0.08375357885746988, "learning_rate": 6.757805027513487e-07, "loss": 0.4718, "step": 2695 }, { "epoch": 4.738137082601055, "grad_norm": 0.08826664624789884, "learning_rate": 6.668251662992075e-07, "loss": 0.474, "step": 2696 }, { "epoch": 4.739894551845342, "grad_norm": 0.08579678014556978, "learning_rate": 6.579290648975401e-07, "loss": 0.4674, "step": 2697 }, { "epoch": 4.741652021089631, "grad_norm": 0.08405479414317242, "learning_rate": 6.490922119437004e-07, "loss": 0.4729, "step": 2698 }, { "epoch": 4.743409490333919, "grad_norm": 0.0829071814829854, "learning_rate": 6.403146207458343e-07, "loss": 0.4933, "step": 2699 }, { "epoch": 4.745166959578207, "grad_norm": 0.08675994076018191, "learning_rate": 6.315963045228391e-07, "loss": 0.4713, "step": 2700 }, { "epoch": 4.746924428822496, "grad_norm": 0.08521252824720542, "learning_rate": 6.229372764043362e-07, "loss": 0.4708, "step": 2701 }, { "epoch": 4.748681898066784, "grad_norm": 0.0865083697348838, "learning_rate": 6.14337549430668e-07, "loss": 0.4742, "step": 2702 }, { "epoch": 4.750439367311072, "grad_norm": 0.0840768506879791, "learning_rate": 6.057971365528659e-07, "loss": 0.4755, "step": 2703 }, { "epoch": 4.752196836555361, "grad_norm": 0.08341364579136731, "learning_rate": 5.97316050632637e-07, "loss": 0.471, "step": 2704 }, { "epoch": 4.753954305799649, "grad_norm": 0.08339107444740396, "learning_rate": 5.888943044423512e-07, "loss": 0.4777, "step": 2705 }, { "epoch": 4.755711775043936, "grad_norm": 0.07981319256186695, "learning_rate": 5.805319106649965e-07, "loss": 0.4729, "step": 2706 }, { "epoch": 4.757469244288225, "grad_norm": 0.08613870867091371, "learning_rate": 5.722288818941923e-07, "loss": 0.4694, "step": 2707 }, { "epoch": 4.759226713532513, "grad_norm": 0.08448760722483499, "learning_rate": 5.639852306341453e-07, "loss": 0.4816, "step": 2708 }, { "epoch": 4.760984182776801, "grad_norm": 0.08254198690281497, "learning_rate": 5.558009692996535e-07, "loss": 0.4704, "step": 2709 }, { "epoch": 4.76274165202109, "grad_norm": 0.08859016212822562, "learning_rate": 5.476761102160577e-07, "loss": 0.4809, "step": 2710 }, { "epoch": 4.764499121265378, "grad_norm": 0.08717999179891987, "learning_rate": 5.396106656192501e-07, "loss": 0.4652, "step": 2711 }, { "epoch": 4.766256590509666, "grad_norm": 0.08336346959771476, "learning_rate": 5.31604647655648e-07, "loss": 0.4737, "step": 2712 }, { "epoch": 4.768014059753955, "grad_norm": 0.0889696378729543, "learning_rate": 5.236580683821669e-07, "loss": 0.4851, "step": 2713 }, { "epoch": 4.769771528998243, "grad_norm": 0.08236601287857687, "learning_rate": 5.157709397662159e-07, "loss": 0.4627, "step": 2714 }, { "epoch": 4.77152899824253, "grad_norm": 0.08388706799636532, "learning_rate": 5.079432736856627e-07, "loss": 0.4748, "step": 2715 }, { "epoch": 4.773286467486819, "grad_norm": 0.08436207042595381, "learning_rate": 5.001750819288376e-07, "loss": 0.4709, "step": 2716 }, { "epoch": 4.775043936731107, "grad_norm": 0.08574128077461371, "learning_rate": 4.924663761944936e-07, "loss": 0.4687, "step": 2717 }, { "epoch": 4.776801405975395, "grad_norm": 0.0882854251332613, "learning_rate": 4.84817168091798e-07, "loss": 0.4666, "step": 2718 }, { "epoch": 4.778558875219684, "grad_norm": 0.0982967425079806, "learning_rate": 4.772274691403222e-07, "loss": 0.4765, "step": 2719 }, { "epoch": 4.780316344463972, "grad_norm": 0.08609593638419515, "learning_rate": 4.6969729077002145e-07, "loss": 0.4794, "step": 2720 }, { "epoch": 4.78207381370826, "grad_norm": 0.08647110025585808, "learning_rate": 4.62226644321202e-07, "loss": 0.4799, "step": 2721 }, { "epoch": 4.783831282952549, "grad_norm": 0.08673494449429522, "learning_rate": 4.5481554104452653e-07, "loss": 0.4743, "step": 2722 }, { "epoch": 4.785588752196837, "grad_norm": 0.08241621809358593, "learning_rate": 4.474639921009738e-07, "loss": 0.4677, "step": 2723 }, { "epoch": 4.787346221441124, "grad_norm": 0.08088170726352849, "learning_rate": 4.401720085618566e-07, "loss": 0.4722, "step": 2724 }, { "epoch": 4.789103690685413, "grad_norm": 0.08575094072399529, "learning_rate": 4.3293960140875946e-07, "loss": 0.4771, "step": 2725 }, { "epoch": 4.790861159929701, "grad_norm": 0.08383130218805404, "learning_rate": 4.2576678153356533e-07, "loss": 0.4808, "step": 2726 }, { "epoch": 4.792618629173989, "grad_norm": 0.08358924369468017, "learning_rate": 4.186535597384067e-07, "loss": 0.4816, "step": 2727 }, { "epoch": 4.794376098418278, "grad_norm": 0.07969472644490669, "learning_rate": 4.115999467356657e-07, "loss": 0.4655, "step": 2728 }, { "epoch": 4.796133567662566, "grad_norm": 0.0842256107174584, "learning_rate": 4.0460595314796067e-07, "loss": 0.4811, "step": 2729 }, { "epoch": 4.797891036906854, "grad_norm": 0.11893700598118541, "learning_rate": 3.976715895081196e-07, "loss": 0.479, "step": 2730 }, { "epoch": 4.799648506151143, "grad_norm": 0.08755452123447369, "learning_rate": 3.907968662591621e-07, "loss": 0.4841, "step": 2731 }, { "epoch": 4.801405975395431, "grad_norm": 0.08859226565432239, "learning_rate": 3.839817937543e-07, "loss": 0.4832, "step": 2732 }, { "epoch": 4.8031634446397184, "grad_norm": 0.08516979420255552, "learning_rate": 3.7722638225691e-07, "loss": 0.4805, "step": 2733 }, { "epoch": 4.804920913884007, "grad_norm": 0.08068214544805563, "learning_rate": 3.705306419405208e-07, "loss": 0.4754, "step": 2734 }, { "epoch": 4.806678383128295, "grad_norm": 0.07942888870036013, "learning_rate": 3.638945828887996e-07, "loss": 0.4732, "step": 2735 }, { "epoch": 4.808435852372583, "grad_norm": 0.0863712273434067, "learning_rate": 3.573182150955257e-07, "loss": 0.4757, "step": 2736 }, { "epoch": 4.810193321616872, "grad_norm": 0.07926217160478959, "learning_rate": 3.508015484645899e-07, "loss": 0.4769, "step": 2737 }, { "epoch": 4.81195079086116, "grad_norm": 0.09213017346344517, "learning_rate": 3.4434459280997754e-07, "loss": 0.4755, "step": 2738 }, { "epoch": 4.813708260105448, "grad_norm": 0.08371177015140557, "learning_rate": 3.379473578557457e-07, "loss": 0.4648, "step": 2739 }, { "epoch": 4.815465729349737, "grad_norm": 0.08147756704255228, "learning_rate": 3.31609853236019e-07, "loss": 0.4798, "step": 2740 }, { "epoch": 4.817223198594025, "grad_norm": 0.08206886624942811, "learning_rate": 3.2533208849495843e-07, "loss": 0.469, "step": 2741 }, { "epoch": 4.8189806678383125, "grad_norm": 0.08551818296127418, "learning_rate": 3.1911407308677033e-07, "loss": 0.4748, "step": 2742 }, { "epoch": 4.820738137082601, "grad_norm": 0.08292007129372923, "learning_rate": 3.1295581637566185e-07, "loss": 0.4804, "step": 2743 }, { "epoch": 4.822495606326889, "grad_norm": 0.0805226520942727, "learning_rate": 3.068573276358677e-07, "loss": 0.4745, "step": 2744 }, { "epoch": 4.824253075571177, "grad_norm": 0.31873344284156574, "learning_rate": 3.008186160516013e-07, "loss": 0.4787, "step": 2745 }, { "epoch": 4.826010544815466, "grad_norm": 0.09835497010556948, "learning_rate": 2.948396907170459e-07, "loss": 0.4867, "step": 2746 }, { "epoch": 4.827768014059754, "grad_norm": 0.08300836077405262, "learning_rate": 2.889205606363632e-07, "loss": 0.4717, "step": 2747 }, { "epoch": 4.829525483304042, "grad_norm": 0.0899735378410569, "learning_rate": 2.8306123472364944e-07, "loss": 0.473, "step": 2748 }, { "epoch": 4.831282952548331, "grad_norm": 0.08633339012313422, "learning_rate": 2.772617218029483e-07, "loss": 0.4712, "step": 2749 }, { "epoch": 4.833040421792619, "grad_norm": 0.08358249778991084, "learning_rate": 2.7152203060822005e-07, "loss": 0.4766, "step": 2750 }, { "epoch": 4.8347978910369065, "grad_norm": 0.08306953731453826, "learning_rate": 2.6584216978333245e-07, "loss": 0.4696, "step": 2751 }, { "epoch": 4.836555360281195, "grad_norm": 0.08425873107003429, "learning_rate": 2.602221478820566e-07, "loss": 0.4736, "step": 2752 }, { "epoch": 4.838312829525483, "grad_norm": 0.08241778165977973, "learning_rate": 2.546619733680489e-07, "loss": 0.4685, "step": 2753 }, { "epoch": 4.8400702987697715, "grad_norm": 0.08377675476230148, "learning_rate": 2.491616546148201e-07, "loss": 0.4707, "step": 2754 }, { "epoch": 4.84182776801406, "grad_norm": 0.08541799137750797, "learning_rate": 2.437211999057576e-07, "loss": 0.4735, "step": 2755 }, { "epoch": 4.843585237258348, "grad_norm": 0.08449222304172499, "learning_rate": 2.3834061743408966e-07, "loss": 0.4753, "step": 2756 }, { "epoch": 4.845342706502636, "grad_norm": 0.08501694077462217, "learning_rate": 2.3301991530287226e-07, "loss": 0.4786, "step": 2757 }, { "epoch": 4.847100175746925, "grad_norm": 0.082908266562115, "learning_rate": 2.2775910152498916e-07, "loss": 0.485, "step": 2758 }, { "epoch": 4.848857644991213, "grad_norm": 0.08188912132800659, "learning_rate": 2.2255818402312946e-07, "loss": 0.4633, "step": 2759 }, { "epoch": 4.8506151142355005, "grad_norm": 0.07948326497854291, "learning_rate": 2.174171706297834e-07, "loss": 0.4658, "step": 2760 }, { "epoch": 4.852372583479789, "grad_norm": 0.08407079603023364, "learning_rate": 2.1233606908722003e-07, "loss": 0.4715, "step": 2761 }, { "epoch": 4.854130052724077, "grad_norm": 0.08350977705054989, "learning_rate": 2.0731488704748725e-07, "loss": 0.4697, "step": 2762 }, { "epoch": 4.8558875219683655, "grad_norm": 0.0901596718062624, "learning_rate": 2.0235363207239845e-07, "loss": 0.4811, "step": 2763 }, { "epoch": 4.857644991212654, "grad_norm": 0.08526381450026649, "learning_rate": 1.9745231163351032e-07, "loss": 0.4873, "step": 2764 }, { "epoch": 4.859402460456942, "grad_norm": 0.08177465821929315, "learning_rate": 1.926109331121273e-07, "loss": 0.4764, "step": 2765 }, { "epoch": 4.8611599297012305, "grad_norm": 0.08337160740963016, "learning_rate": 1.8782950379927056e-07, "loss": 0.4844, "step": 2766 }, { "epoch": 4.862917398945519, "grad_norm": 0.08056643179692909, "learning_rate": 1.8310803089569118e-07, "loss": 0.4801, "step": 2767 }, { "epoch": 4.864674868189807, "grad_norm": 0.08435902578931377, "learning_rate": 1.7844652151183916e-07, "loss": 0.463, "step": 2768 }, { "epoch": 4.8664323374340945, "grad_norm": 0.08188470925828084, "learning_rate": 1.7384498266786787e-07, "loss": 0.4599, "step": 2769 }, { "epoch": 4.868189806678383, "grad_norm": 0.08001644758678403, "learning_rate": 1.6930342129360733e-07, "loss": 0.4721, "step": 2770 }, { "epoch": 4.869947275922671, "grad_norm": 0.0789195517992249, "learning_rate": 1.6482184422857317e-07, "loss": 0.4598, "step": 2771 }, { "epoch": 4.8717047451669595, "grad_norm": 0.09179245257473993, "learning_rate": 1.6040025822193107e-07, "loss": 0.4749, "step": 2772 }, { "epoch": 4.873462214411248, "grad_norm": 0.0876907971784558, "learning_rate": 1.5603866993251448e-07, "loss": 0.4849, "step": 2773 }, { "epoch": 4.875219683655536, "grad_norm": 0.08356914436612944, "learning_rate": 1.5173708592879367e-07, "loss": 0.4691, "step": 2774 }, { "epoch": 4.8769771528998245, "grad_norm": 0.08817824125193642, "learning_rate": 1.4749551268887995e-07, "loss": 0.483, "step": 2775 }, { "epoch": 4.878734622144113, "grad_norm": 0.08378172506893167, "learning_rate": 1.4331395660050372e-07, "loss": 0.4736, "step": 2776 }, { "epoch": 4.880492091388401, "grad_norm": 0.08305434712852688, "learning_rate": 1.391924239610143e-07, "loss": 0.4763, "step": 2777 }, { "epoch": 4.882249560632689, "grad_norm": 0.08341726002915478, "learning_rate": 1.3513092097736656e-07, "loss": 0.4809, "step": 2778 }, { "epoch": 4.884007029876977, "grad_norm": 0.08356927715694477, "learning_rate": 1.3112945376610343e-07, "loss": 0.4707, "step": 2779 }, { "epoch": 4.885764499121265, "grad_norm": 0.08517354004080714, "learning_rate": 1.271880283533733e-07, "loss": 0.482, "step": 2780 }, { "epoch": 4.8875219683655535, "grad_norm": 0.08766021883181878, "learning_rate": 1.2330665067488145e-07, "loss": 0.4712, "step": 2781 }, { "epoch": 4.889279437609842, "grad_norm": 0.08034227530298793, "learning_rate": 1.194853265759166e-07, "loss": 0.4746, "step": 2782 }, { "epoch": 4.89103690685413, "grad_norm": 0.07851251136799736, "learning_rate": 1.1572406181132423e-07, "loss": 0.4612, "step": 2783 }, { "epoch": 4.8927943760984185, "grad_norm": 0.08536628093416994, "learning_rate": 1.120228620455066e-07, "loss": 0.4761, "step": 2784 }, { "epoch": 4.894551845342707, "grad_norm": 0.091466427020515, "learning_rate": 1.0838173285239173e-07, "loss": 0.475, "step": 2785 }, { "epoch": 4.896309314586995, "grad_norm": 0.08069593641335793, "learning_rate": 1.0480067971546437e-07, "loss": 0.4751, "step": 2786 }, { "epoch": 4.898066783831283, "grad_norm": 0.08307996090103671, "learning_rate": 1.0127970802771725e-07, "loss": 0.477, "step": 2787 }, { "epoch": 4.899824253075571, "grad_norm": 0.07850651248652073, "learning_rate": 9.781882309167768e-08, "loss": 0.4889, "step": 2788 }, { "epoch": 4.901581722319859, "grad_norm": 0.0839934599833232, "learning_rate": 9.4418030119372e-08, "loss": 0.4819, "step": 2789 }, { "epoch": 4.9033391915641475, "grad_norm": 0.08269549430433347, "learning_rate": 9.107733423233456e-08, "loss": 0.4847, "step": 2790 }, { "epoch": 4.905096660808436, "grad_norm": 0.08170689335065962, "learning_rate": 8.779674046159426e-08, "loss": 0.4752, "step": 2791 }, { "epoch": 4.906854130052724, "grad_norm": 0.08734066906515324, "learning_rate": 8.457625374766133e-08, "loss": 0.4786, "step": 2792 }, { "epoch": 4.9086115992970125, "grad_norm": 0.08484808796468905, "learning_rate": 8.141587894053615e-08, "loss": 0.4928, "step": 2793 }, { "epoch": 4.910369068541301, "grad_norm": 0.08219418905600914, "learning_rate": 7.831562079968269e-08, "loss": 0.4725, "step": 2794 }, { "epoch": 4.912126537785589, "grad_norm": 0.07912647399041058, "learning_rate": 7.527548399403284e-08, "loss": 0.4632, "step": 2795 }, { "epoch": 4.913884007029877, "grad_norm": 0.08091366005911206, "learning_rate": 7.22954731019776e-08, "loss": 0.4625, "step": 2796 }, { "epoch": 4.915641476274165, "grad_norm": 0.08394720371702812, "learning_rate": 6.937559261136706e-08, "loss": 0.4814, "step": 2797 }, { "epoch": 4.917398945518453, "grad_norm": 0.08042241178527097, "learning_rate": 6.651584691947488e-08, "loss": 0.4691, "step": 2798 }, { "epoch": 4.919156414762742, "grad_norm": 0.08335736370779677, "learning_rate": 6.371624033303825e-08, "loss": 0.4699, "step": 2799 }, { "epoch": 4.92091388400703, "grad_norm": 0.08382827746195899, "learning_rate": 6.097677706820904e-08, "loss": 0.4741, "step": 2800 }, { "epoch": 4.922671353251318, "grad_norm": 0.08126126431608299, "learning_rate": 5.8297461250571563e-08, "loss": 0.4731, "step": 2801 }, { "epoch": 4.9244288224956065, "grad_norm": 0.08446283637073981, "learning_rate": 5.567829691512483e-08, "loss": 0.4718, "step": 2802 }, { "epoch": 4.926186291739895, "grad_norm": 0.08430077692672228, "learning_rate": 5.311928800628252e-08, "loss": 0.4763, "step": 2803 }, { "epoch": 4.927943760984183, "grad_norm": 0.07738786455208732, "learning_rate": 5.0620438377868564e-08, "loss": 0.4757, "step": 2804 }, { "epoch": 4.929701230228471, "grad_norm": 0.07980252071877962, "learning_rate": 4.8181751793103806e-08, "loss": 0.4678, "step": 2805 }, { "epoch": 4.931458699472759, "grad_norm": 0.0864493422785434, "learning_rate": 4.5803231924606005e-08, "loss": 0.4648, "step": 2806 }, { "epoch": 4.933216168717047, "grad_norm": 0.08278016140640754, "learning_rate": 4.3484882354385414e-08, "loss": 0.4664, "step": 2807 }, { "epoch": 4.934973637961336, "grad_norm": 0.08016713409044014, "learning_rate": 4.122670657383143e-08, "loss": 0.4817, "step": 2808 }, { "epoch": 4.936731107205624, "grad_norm": 0.08063882067313331, "learning_rate": 3.902870798371261e-08, "loss": 0.4669, "step": 2809 }, { "epoch": 4.938488576449912, "grad_norm": 0.08590706572745707, "learning_rate": 3.689088989418554e-08, "loss": 0.4864, "step": 2810 }, { "epoch": 4.940246045694201, "grad_norm": 0.08168879284397539, "learning_rate": 3.48132555247549e-08, "loss": 0.4662, "step": 2811 }, { "epoch": 4.942003514938489, "grad_norm": 0.08032291459148733, "learning_rate": 3.2795808004308926e-08, "loss": 0.4833, "step": 2812 }, { "epoch": 4.943760984182777, "grad_norm": 0.08174578751558441, "learning_rate": 3.083855037108396e-08, "loss": 0.4755, "step": 2813 }, { "epoch": 4.945518453427065, "grad_norm": 0.08455584486156896, "learning_rate": 2.8941485572668848e-08, "loss": 0.4701, "step": 2814 }, { "epoch": 4.947275922671353, "grad_norm": 0.08385082940431463, "learning_rate": 2.710461646601825e-08, "loss": 0.4791, "step": 2815 }, { "epoch": 4.949033391915641, "grad_norm": 0.08112667201065245, "learning_rate": 2.5327945817421594e-08, "loss": 0.4646, "step": 2816 }, { "epoch": 4.95079086115993, "grad_norm": 0.0836606291158677, "learning_rate": 2.3611476302507486e-08, "loss": 0.4717, "step": 2817 }, { "epoch": 4.952548330404218, "grad_norm": 0.08353657346002778, "learning_rate": 2.19552105062526e-08, "loss": 0.4796, "step": 2818 }, { "epoch": 4.954305799648506, "grad_norm": 0.08525408264724983, "learning_rate": 2.035915092296392e-08, "loss": 0.4708, "step": 2819 }, { "epoch": 4.956063268892795, "grad_norm": 0.08285629210404061, "learning_rate": 1.88232999562743e-08, "loss": 0.4738, "step": 2820 }, { "epoch": 4.957820738137083, "grad_norm": 0.07991626580502059, "learning_rate": 1.734765991915133e-08, "loss": 0.469, "step": 2821 }, { "epoch": 4.959578207381371, "grad_norm": 0.08012204235159386, "learning_rate": 1.5932233033879586e-08, "loss": 0.4766, "step": 2822 }, { "epoch": 4.961335676625659, "grad_norm": 0.08347245401810766, "learning_rate": 1.4577021432065075e-08, "loss": 0.4876, "step": 2823 }, { "epoch": 4.963093145869947, "grad_norm": 0.07809961952590994, "learning_rate": 1.3282027154639665e-08, "loss": 0.479, "step": 2824 }, { "epoch": 4.964850615114235, "grad_norm": 0.0814170915111165, "learning_rate": 1.2047252151830002e-08, "loss": 0.4715, "step": 2825 }, { "epoch": 4.966608084358524, "grad_norm": 0.08071397826298964, "learning_rate": 1.087269828319748e-08, "loss": 0.4757, "step": 2826 }, { "epoch": 4.968365553602812, "grad_norm": 0.08129898335790794, "learning_rate": 9.75836731758939e-09, "loss": 0.4803, "step": 2827 }, { "epoch": 4.9701230228471, "grad_norm": 0.07830616429463833, "learning_rate": 8.704260933170006e-09, "loss": 0.4757, "step": 2828 }, { "epoch": 4.971880492091389, "grad_norm": 0.08215973074921688, "learning_rate": 7.710380717407261e-09, "loss": 0.4714, "step": 2829 }, { "epoch": 4.973637961335677, "grad_norm": 0.08582670486808129, "learning_rate": 6.776728167063873e-09, "loss": 0.4716, "step": 2830 }, { "epoch": 4.975395430579965, "grad_norm": 0.07832522800156796, "learning_rate": 5.903304688206213e-09, "loss": 0.4767, "step": 2831 }, { "epoch": 4.977152899824253, "grad_norm": 0.08024375967831743, "learning_rate": 5.090111596190994e-09, "loss": 0.4682, "step": 2832 }, { "epoch": 4.978910369068541, "grad_norm": 0.07971191823306172, "learning_rate": 4.33715011567859e-09, "loss": 0.4873, "step": 2833 }, { "epoch": 4.980667838312829, "grad_norm": 0.08197728124451616, "learning_rate": 3.644421380606389e-09, "loss": 0.4719, "step": 2834 }, { "epoch": 4.982425307557118, "grad_norm": 0.08256125252147417, "learning_rate": 3.0119264342198805e-09, "loss": 0.4765, "step": 2835 }, { "epoch": 4.984182776801406, "grad_norm": 0.07978884962890753, "learning_rate": 2.43966622903713e-09, "loss": 0.4739, "step": 2836 }, { "epoch": 4.985940246045694, "grad_norm": 0.0823478555407601, "learning_rate": 1.9276416268798613e-09, "loss": 0.4874, "step": 2837 }, { "epoch": 4.987697715289983, "grad_norm": 0.07860721952909909, "learning_rate": 1.475853398842375e-09, "loss": 0.4804, "step": 2838 }, { "epoch": 4.989455184534271, "grad_norm": 0.0795492227471074, "learning_rate": 1.0843022253093082e-09, "loss": 0.4643, "step": 2839 }, { "epoch": 4.991212653778559, "grad_norm": 0.08121373428838458, "learning_rate": 7.529886959556365e-10, "loss": 0.4774, "step": 2840 }, { "epoch": 4.992970123022847, "grad_norm": 0.07994760483297385, "learning_rate": 4.819133097244688e-10, "loss": 0.4642, "step": 2841 }, { "epoch": 4.994727592267135, "grad_norm": 0.08379468456181956, "learning_rate": 2.7107647485813405e-10, "loss": 0.4761, "step": 2842 }, { "epoch": 4.996485061511423, "grad_norm": 0.08883633883462522, "learning_rate": 1.2047850887153546e-10, "loss": 0.4734, "step": 2843 }, { "epoch": 4.998242530755712, "grad_norm": 0.0809735567075355, "learning_rate": 3.011963855659161e-11, "loss": 0.4757, "step": 2844 }, { "epoch": 5.0, "grad_norm": 0.09984139330744836, "learning_rate": 0.0, "loss": 0.4687, "step": 2845 }, { "epoch": 5.0, "step": 2845, "total_flos": 4.772812472451072e+16, "train_loss": 0.0, "train_runtime": 8.8002, "train_samples_per_second": 165280.93, "train_steps_per_second": 323.289 } ], "logging_steps": 1, "max_steps": 2845, "num_input_tokens_seen": 0, "num_train_epochs": 5, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 4.772812472451072e+16, "train_batch_size": 1, "trial_name": null, "trial_params": null }