diff --git "a/trainer_state.json" "b/trainer_state.json" new file mode 100644--- /dev/null +++ "b/trainer_state.json" @@ -0,0 +1,54665 @@ +{ + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 0.9999359426045737, + "eval_steps": 500, + "global_step": 7805, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.0, + "grad_norm": 1.7584967613220215, + "learning_rate": 4.2553191489361707e-08, + "loss": 0.7626, + "step": 1 + }, + { + "epoch": 0.0, + "grad_norm": 6.582927227020264, + "learning_rate": 8.510638297872341e-08, + "loss": 0.951, + "step": 2 + }, + { + "epoch": 0.0, + "grad_norm": 8.378379821777344, + "learning_rate": 1.276595744680851e-07, + "loss": 0.9451, + "step": 3 + }, + { + "epoch": 0.0, + "grad_norm": 9.346311569213867, + "learning_rate": 1.7021276595744683e-07, + "loss": 1.0475, + "step": 4 + }, + { + "epoch": 0.0, + "grad_norm": 6.422606945037842, + "learning_rate": 2.1276595744680852e-07, + "loss": 0.8773, + "step": 5 + }, + { + "epoch": 0.0, + "grad_norm": 13.563347816467285, + "learning_rate": 2.553191489361702e-07, + "loss": 0.8474, + "step": 6 + }, + { + "epoch": 0.0, + "grad_norm": 6.2637224197387695, + "learning_rate": 2.9787234042553196e-07, + "loss": 0.9445, + "step": 7 + }, + { + "epoch": 0.0, + "grad_norm": 7.488187313079834, + "learning_rate": 3.4042553191489365e-07, + "loss": 1.0331, + "step": 8 + }, + { + "epoch": 0.0, + "grad_norm": 9.318852424621582, + "learning_rate": 3.8297872340425535e-07, + "loss": 1.0007, + "step": 9 + }, + { + "epoch": 0.0, + "grad_norm": 8.960872650146484, + "learning_rate": 4.2553191489361704e-07, + "loss": 0.9706, + "step": 10 + }, + { + "epoch": 0.0, + "grad_norm": 7.051608562469482, + "learning_rate": 4.6808510638297873e-07, + "loss": 1.0274, + "step": 11 + }, + { + "epoch": 0.0, + "grad_norm": 5.044897556304932, + "learning_rate": 5.106382978723404e-07, + "loss": 0.9102, + "step": 12 + }, + { + "epoch": 0.0, + "grad_norm": 5.335330009460449, + "learning_rate": 5.531914893617021e-07, + "loss": 1.0717, + "step": 13 + }, + { + "epoch": 0.0, + "grad_norm": 5.157832145690918, + "learning_rate": 5.957446808510639e-07, + "loss": 0.9263, + "step": 14 + }, + { + "epoch": 0.0, + "grad_norm": 5.903883934020996, + "learning_rate": 6.382978723404255e-07, + "loss": 1.1029, + "step": 15 + }, + { + "epoch": 0.0, + "grad_norm": 3.430100679397583, + "learning_rate": 6.808510638297873e-07, + "loss": 0.8821, + "step": 16 + }, + { + "epoch": 0.0, + "grad_norm": 4.4080023765563965, + "learning_rate": 7.234042553191489e-07, + "loss": 0.9419, + "step": 17 + }, + { + "epoch": 0.0, + "grad_norm": 3.959429979324341, + "learning_rate": 7.659574468085107e-07, + "loss": 0.922, + "step": 18 + }, + { + "epoch": 0.0, + "grad_norm": 3.123316764831543, + "learning_rate": 8.085106382978725e-07, + "loss": 0.7857, + "step": 19 + }, + { + "epoch": 0.0, + "grad_norm": 3.357064723968506, + "learning_rate": 8.510638297872341e-07, + "loss": 0.8643, + "step": 20 + }, + { + "epoch": 0.0, + "grad_norm": 3.7573935985565186, + "learning_rate": 8.936170212765959e-07, + "loss": 0.9179, + "step": 21 + }, + { + "epoch": 0.0, + "grad_norm": 6.969594478607178, + "learning_rate": 9.361702127659575e-07, + "loss": 0.8299, + "step": 22 + }, + { + "epoch": 0.0, + "grad_norm": 2.6956169605255127, + "learning_rate": 9.787234042553193e-07, + "loss": 0.7627, + "step": 23 + }, + { + "epoch": 0.0, + "grad_norm": 3.103651523590088, + "learning_rate": 1.0212765957446809e-06, + "loss": 0.8709, + "step": 24 + }, + { + "epoch": 0.0, + "grad_norm": 2.5962374210357666, + "learning_rate": 1.0638297872340427e-06, + "loss": 0.6788, + "step": 25 + }, + { + "epoch": 0.0, + "grad_norm": 3.108245849609375, + "learning_rate": 1.1063829787234042e-06, + "loss": 0.8235, + "step": 26 + }, + { + "epoch": 0.0, + "grad_norm": 2.832303762435913, + "learning_rate": 1.148936170212766e-06, + "loss": 0.8102, + "step": 27 + }, + { + "epoch": 0.0, + "grad_norm": 3.666902780532837, + "learning_rate": 1.1914893617021278e-06, + "loss": 0.7508, + "step": 28 + }, + { + "epoch": 0.0, + "grad_norm": 2.241725206375122, + "learning_rate": 1.2340425531914894e-06, + "loss": 0.8194, + "step": 29 + }, + { + "epoch": 0.0, + "grad_norm": 2.413132905960083, + "learning_rate": 1.276595744680851e-06, + "loss": 0.8131, + "step": 30 + }, + { + "epoch": 0.0, + "grad_norm": 1.7967190742492676, + "learning_rate": 1.3191489361702128e-06, + "loss": 0.766, + "step": 31 + }, + { + "epoch": 0.0, + "grad_norm": 1.8691911697387695, + "learning_rate": 1.3617021276595746e-06, + "loss": 0.8, + "step": 32 + }, + { + "epoch": 0.0, + "grad_norm": 1.7689558267593384, + "learning_rate": 1.4042553191489364e-06, + "loss": 0.8018, + "step": 33 + }, + { + "epoch": 0.0, + "grad_norm": 1.7399364709854126, + "learning_rate": 1.4468085106382978e-06, + "loss": 0.5927, + "step": 34 + }, + { + "epoch": 0.0, + "grad_norm": 1.6319252252578735, + "learning_rate": 1.4893617021276596e-06, + "loss": 0.6501, + "step": 35 + }, + { + "epoch": 0.0, + "grad_norm": 1.570933222770691, + "learning_rate": 1.5319148936170214e-06, + "loss": 0.7304, + "step": 36 + }, + { + "epoch": 0.0, + "grad_norm": 1.6395325660705566, + "learning_rate": 1.5744680851063832e-06, + "loss": 0.5934, + "step": 37 + }, + { + "epoch": 0.0, + "grad_norm": 1.7731611728668213, + "learning_rate": 1.617021276595745e-06, + "loss": 0.775, + "step": 38 + }, + { + "epoch": 0.0, + "grad_norm": 1.7520402669906616, + "learning_rate": 1.6595744680851064e-06, + "loss": 0.6107, + "step": 39 + }, + { + "epoch": 0.01, + "grad_norm": 1.782818078994751, + "learning_rate": 1.7021276595744682e-06, + "loss": 0.7435, + "step": 40 + }, + { + "epoch": 0.01, + "grad_norm": 1.6297168731689453, + "learning_rate": 1.74468085106383e-06, + "loss": 0.738, + "step": 41 + }, + { + "epoch": 0.01, + "grad_norm": 1.5183852910995483, + "learning_rate": 1.7872340425531918e-06, + "loss": 0.7191, + "step": 42 + }, + { + "epoch": 0.01, + "grad_norm": 2.116650104522705, + "learning_rate": 1.8297872340425531e-06, + "loss": 0.7662, + "step": 43 + }, + { + "epoch": 0.01, + "grad_norm": 1.5137921571731567, + "learning_rate": 1.872340425531915e-06, + "loss": 0.7391, + "step": 44 + }, + { + "epoch": 0.01, + "grad_norm": 1.1828657388687134, + "learning_rate": 1.9148936170212767e-06, + "loss": 0.7095, + "step": 45 + }, + { + "epoch": 0.01, + "grad_norm": 1.526063084602356, + "learning_rate": 1.9574468085106385e-06, + "loss": 0.7774, + "step": 46 + }, + { + "epoch": 0.01, + "grad_norm": 1.8120944499969482, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.6718, + "step": 47 + }, + { + "epoch": 0.01, + "grad_norm": 1.4887276887893677, + "learning_rate": 2.0425531914893617e-06, + "loss": 0.6486, + "step": 48 + }, + { + "epoch": 0.01, + "grad_norm": 1.4536372423171997, + "learning_rate": 2.0851063829787235e-06, + "loss": 0.7203, + "step": 49 + }, + { + "epoch": 0.01, + "grad_norm": 1.6725234985351562, + "learning_rate": 2.1276595744680853e-06, + "loss": 0.6735, + "step": 50 + }, + { + "epoch": 0.01, + "grad_norm": 1.526537537574768, + "learning_rate": 2.170212765957447e-06, + "loss": 0.6525, + "step": 51 + }, + { + "epoch": 0.01, + "grad_norm": 3.7510359287261963, + "learning_rate": 2.2127659574468085e-06, + "loss": 0.5954, + "step": 52 + }, + { + "epoch": 0.01, + "grad_norm": 1.2007803916931152, + "learning_rate": 2.2553191489361703e-06, + "loss": 0.7316, + "step": 53 + }, + { + "epoch": 0.01, + "grad_norm": 1.4945361614227295, + "learning_rate": 2.297872340425532e-06, + "loss": 0.7241, + "step": 54 + }, + { + "epoch": 0.01, + "grad_norm": 1.3967207670211792, + "learning_rate": 2.340425531914894e-06, + "loss": 0.6345, + "step": 55 + }, + { + "epoch": 0.01, + "grad_norm": 1.3292012214660645, + "learning_rate": 2.3829787234042557e-06, + "loss": 0.7134, + "step": 56 + }, + { + "epoch": 0.01, + "grad_norm": 1.5092567205429077, + "learning_rate": 2.425531914893617e-06, + "loss": 0.6689, + "step": 57 + }, + { + "epoch": 0.01, + "grad_norm": 1.9081007242202759, + "learning_rate": 2.468085106382979e-06, + "loss": 0.8589, + "step": 58 + }, + { + "epoch": 0.01, + "grad_norm": 1.3839454650878906, + "learning_rate": 2.5106382978723402e-06, + "loss": 0.6063, + "step": 59 + }, + { + "epoch": 0.01, + "grad_norm": 1.3715382814407349, + "learning_rate": 2.553191489361702e-06, + "loss": 0.5894, + "step": 60 + }, + { + "epoch": 0.01, + "grad_norm": 1.2574067115783691, + "learning_rate": 2.595744680851064e-06, + "loss": 0.6662, + "step": 61 + }, + { + "epoch": 0.01, + "grad_norm": 1.4591063261032104, + "learning_rate": 2.6382978723404256e-06, + "loss": 0.6227, + "step": 62 + }, + { + "epoch": 0.01, + "grad_norm": 2.287207841873169, + "learning_rate": 2.6808510638297874e-06, + "loss": 0.639, + "step": 63 + }, + { + "epoch": 0.01, + "grad_norm": 1.7553712129592896, + "learning_rate": 2.7234042553191492e-06, + "loss": 0.6952, + "step": 64 + }, + { + "epoch": 0.01, + "grad_norm": 1.2830992937088013, + "learning_rate": 2.765957446808511e-06, + "loss": 0.6323, + "step": 65 + }, + { + "epoch": 0.01, + "grad_norm": 1.0832946300506592, + "learning_rate": 2.808510638297873e-06, + "loss": 0.6222, + "step": 66 + }, + { + "epoch": 0.01, + "grad_norm": 1.8043638467788696, + "learning_rate": 2.8510638297872346e-06, + "loss": 0.6623, + "step": 67 + }, + { + "epoch": 0.01, + "grad_norm": 1.494361162185669, + "learning_rate": 2.8936170212765956e-06, + "loss": 0.6664, + "step": 68 + }, + { + "epoch": 0.01, + "grad_norm": 1.4487807750701904, + "learning_rate": 2.9361702127659574e-06, + "loss": 0.6772, + "step": 69 + }, + { + "epoch": 0.01, + "grad_norm": 1.7595090866088867, + "learning_rate": 2.978723404255319e-06, + "loss": 0.6911, + "step": 70 + }, + { + "epoch": 0.01, + "grad_norm": 1.4395978450775146, + "learning_rate": 3.021276595744681e-06, + "loss": 0.6153, + "step": 71 + }, + { + "epoch": 0.01, + "grad_norm": 1.4251700639724731, + "learning_rate": 3.0638297872340428e-06, + "loss": 0.6041, + "step": 72 + }, + { + "epoch": 0.01, + "grad_norm": 1.3661823272705078, + "learning_rate": 3.1063829787234046e-06, + "loss": 0.6282, + "step": 73 + }, + { + "epoch": 0.01, + "grad_norm": 1.562219262123108, + "learning_rate": 3.1489361702127664e-06, + "loss": 0.5959, + "step": 74 + }, + { + "epoch": 0.01, + "grad_norm": 2.0947787761688232, + "learning_rate": 3.191489361702128e-06, + "loss": 0.6361, + "step": 75 + }, + { + "epoch": 0.01, + "grad_norm": 2.0909323692321777, + "learning_rate": 3.23404255319149e-06, + "loss": 0.659, + "step": 76 + }, + { + "epoch": 0.01, + "grad_norm": 2.1692888736724854, + "learning_rate": 3.276595744680851e-06, + "loss": 0.6357, + "step": 77 + }, + { + "epoch": 0.01, + "grad_norm": 1.6602727174758911, + "learning_rate": 3.3191489361702127e-06, + "loss": 0.6931, + "step": 78 + }, + { + "epoch": 0.01, + "grad_norm": 1.3999605178833008, + "learning_rate": 3.3617021276595745e-06, + "loss": 0.6471, + "step": 79 + }, + { + "epoch": 0.01, + "grad_norm": 1.3343745470046997, + "learning_rate": 3.4042553191489363e-06, + "loss": 0.6078, + "step": 80 + }, + { + "epoch": 0.01, + "grad_norm": 1.4638773202896118, + "learning_rate": 3.446808510638298e-06, + "loss": 0.5754, + "step": 81 + }, + { + "epoch": 0.01, + "grad_norm": 2.1183791160583496, + "learning_rate": 3.48936170212766e-06, + "loss": 0.6925, + "step": 82 + }, + { + "epoch": 0.01, + "grad_norm": 2.213529348373413, + "learning_rate": 3.5319148936170217e-06, + "loss": 0.6461, + "step": 83 + }, + { + "epoch": 0.01, + "grad_norm": 1.4197733402252197, + "learning_rate": 3.5744680851063835e-06, + "loss": 0.5335, + "step": 84 + }, + { + "epoch": 0.01, + "grad_norm": 1.487101435661316, + "learning_rate": 3.6170212765957453e-06, + "loss": 0.5241, + "step": 85 + }, + { + "epoch": 0.01, + "grad_norm": 1.4877684116363525, + "learning_rate": 3.6595744680851063e-06, + "loss": 0.7288, + "step": 86 + }, + { + "epoch": 0.01, + "grad_norm": 1.5158016681671143, + "learning_rate": 3.702127659574468e-06, + "loss": 0.7598, + "step": 87 + }, + { + "epoch": 0.01, + "grad_norm": 1.8782508373260498, + "learning_rate": 3.74468085106383e-06, + "loss": 0.5951, + "step": 88 + }, + { + "epoch": 0.01, + "grad_norm": 1.9978234767913818, + "learning_rate": 3.7872340425531917e-06, + "loss": 0.7127, + "step": 89 + }, + { + "epoch": 0.01, + "grad_norm": 5.292247772216797, + "learning_rate": 3.8297872340425535e-06, + "loss": 0.5761, + "step": 90 + }, + { + "epoch": 0.01, + "grad_norm": 1.4210480451583862, + "learning_rate": 3.872340425531915e-06, + "loss": 0.6722, + "step": 91 + }, + { + "epoch": 0.01, + "grad_norm": 1.5236140489578247, + "learning_rate": 3.914893617021277e-06, + "loss": 0.6568, + "step": 92 + }, + { + "epoch": 0.01, + "grad_norm": 1.1957457065582275, + "learning_rate": 3.957446808510639e-06, + "loss": 0.6213, + "step": 93 + }, + { + "epoch": 0.01, + "grad_norm": 1.5347182750701904, + "learning_rate": 4.000000000000001e-06, + "loss": 0.574, + "step": 94 + }, + { + "epoch": 0.01, + "grad_norm": 1.1585361957550049, + "learning_rate": 4.042553191489362e-06, + "loss": 0.5121, + "step": 95 + }, + { + "epoch": 0.01, + "grad_norm": 1.6174169778823853, + "learning_rate": 4.085106382978723e-06, + "loss": 0.6843, + "step": 96 + }, + { + "epoch": 0.01, + "grad_norm": 1.3933274745941162, + "learning_rate": 4.127659574468085e-06, + "loss": 0.7019, + "step": 97 + }, + { + "epoch": 0.01, + "grad_norm": 3.139785051345825, + "learning_rate": 4.170212765957447e-06, + "loss": 0.6227, + "step": 98 + }, + { + "epoch": 0.01, + "grad_norm": 1.218185544013977, + "learning_rate": 4.212765957446809e-06, + "loss": 0.6092, + "step": 99 + }, + { + "epoch": 0.01, + "grad_norm": 1.3104459047317505, + "learning_rate": 4.255319148936171e-06, + "loss": 0.6566, + "step": 100 + }, + { + "epoch": 0.01, + "grad_norm": 1.3607113361358643, + "learning_rate": 4.297872340425532e-06, + "loss": 0.6997, + "step": 101 + }, + { + "epoch": 0.01, + "grad_norm": 1.6485787630081177, + "learning_rate": 4.340425531914894e-06, + "loss": 0.7079, + "step": 102 + }, + { + "epoch": 0.01, + "grad_norm": 3.3833348751068115, + "learning_rate": 4.382978723404256e-06, + "loss": 0.5843, + "step": 103 + }, + { + "epoch": 0.01, + "grad_norm": 1.7810088396072388, + "learning_rate": 4.425531914893617e-06, + "loss": 0.6966, + "step": 104 + }, + { + "epoch": 0.01, + "grad_norm": 1.511935830116272, + "learning_rate": 4.468085106382979e-06, + "loss": 0.6428, + "step": 105 + }, + { + "epoch": 0.01, + "grad_norm": 1.4718260765075684, + "learning_rate": 4.5106382978723406e-06, + "loss": 0.6144, + "step": 106 + }, + { + "epoch": 0.01, + "grad_norm": 1.8542895317077637, + "learning_rate": 4.553191489361702e-06, + "loss": 0.6564, + "step": 107 + }, + { + "epoch": 0.01, + "grad_norm": 1.34366774559021, + "learning_rate": 4.595744680851064e-06, + "loss": 0.5284, + "step": 108 + }, + { + "epoch": 0.01, + "grad_norm": 1.454941987991333, + "learning_rate": 4.638297872340426e-06, + "loss": 0.8061, + "step": 109 + }, + { + "epoch": 0.01, + "grad_norm": 1.738389015197754, + "learning_rate": 4.680851063829788e-06, + "loss": 0.6759, + "step": 110 + }, + { + "epoch": 0.01, + "grad_norm": 1.6170207262039185, + "learning_rate": 4.7234042553191496e-06, + "loss": 0.6429, + "step": 111 + }, + { + "epoch": 0.01, + "grad_norm": 1.628777027130127, + "learning_rate": 4.765957446808511e-06, + "loss": 0.6829, + "step": 112 + }, + { + "epoch": 0.01, + "grad_norm": 1.252131700515747, + "learning_rate": 4.808510638297872e-06, + "loss": 0.5218, + "step": 113 + }, + { + "epoch": 0.01, + "grad_norm": 1.4437956809997559, + "learning_rate": 4.851063829787234e-06, + "loss": 0.667, + "step": 114 + }, + { + "epoch": 0.01, + "grad_norm": 6.339471817016602, + "learning_rate": 4.893617021276596e-06, + "loss": 0.668, + "step": 115 + }, + { + "epoch": 0.01, + "grad_norm": 1.4656023979187012, + "learning_rate": 4.936170212765958e-06, + "loss": 0.6703, + "step": 116 + }, + { + "epoch": 0.01, + "grad_norm": 8.32915210723877, + "learning_rate": 4.9787234042553195e-06, + "loss": 0.6228, + "step": 117 + }, + { + "epoch": 0.02, + "grad_norm": 1.32205069065094, + "learning_rate": 5.0212765957446805e-06, + "loss": 0.5615, + "step": 118 + }, + { + "epoch": 0.02, + "grad_norm": 1.2250367403030396, + "learning_rate": 5.063829787234042e-06, + "loss": 0.6659, + "step": 119 + }, + { + "epoch": 0.02, + "grad_norm": 1.2720946073532104, + "learning_rate": 5.106382978723404e-06, + "loss": 0.5885, + "step": 120 + }, + { + "epoch": 0.02, + "grad_norm": 1.233543872833252, + "learning_rate": 5.148936170212766e-06, + "loss": 0.7094, + "step": 121 + }, + { + "epoch": 0.02, + "grad_norm": 1.4377548694610596, + "learning_rate": 5.191489361702128e-06, + "loss": 0.7314, + "step": 122 + }, + { + "epoch": 0.02, + "grad_norm": 2.2915871143341064, + "learning_rate": 5.2340425531914895e-06, + "loss": 0.6415, + "step": 123 + }, + { + "epoch": 0.02, + "grad_norm": 2.056044101715088, + "learning_rate": 5.276595744680851e-06, + "loss": 0.6211, + "step": 124 + }, + { + "epoch": 0.02, + "grad_norm": 2.617048740386963, + "learning_rate": 5.319148936170213e-06, + "loss": 0.6948, + "step": 125 + }, + { + "epoch": 0.02, + "grad_norm": 1.7886021137237549, + "learning_rate": 5.361702127659575e-06, + "loss": 0.7082, + "step": 126 + }, + { + "epoch": 0.02, + "grad_norm": 1.2329413890838623, + "learning_rate": 5.404255319148937e-06, + "loss": 0.6325, + "step": 127 + }, + { + "epoch": 0.02, + "grad_norm": 1.183271050453186, + "learning_rate": 5.4468085106382985e-06, + "loss": 0.5629, + "step": 128 + }, + { + "epoch": 0.02, + "grad_norm": 1.68038809299469, + "learning_rate": 5.48936170212766e-06, + "loss": 0.6629, + "step": 129 + }, + { + "epoch": 0.02, + "grad_norm": 1.1709634065628052, + "learning_rate": 5.531914893617022e-06, + "loss": 0.7344, + "step": 130 + }, + { + "epoch": 0.02, + "grad_norm": 1.245133638381958, + "learning_rate": 5.574468085106384e-06, + "loss": 0.7115, + "step": 131 + }, + { + "epoch": 0.02, + "grad_norm": 1.9844391345977783, + "learning_rate": 5.617021276595746e-06, + "loss": 0.6059, + "step": 132 + }, + { + "epoch": 0.02, + "grad_norm": 3.47454571723938, + "learning_rate": 5.6595744680851075e-06, + "loss": 0.6277, + "step": 133 + }, + { + "epoch": 0.02, + "grad_norm": 1.5599538087844849, + "learning_rate": 5.702127659574469e-06, + "loss": 0.7082, + "step": 134 + }, + { + "epoch": 0.02, + "grad_norm": 1.6010850667953491, + "learning_rate": 5.744680851063831e-06, + "loss": 0.6496, + "step": 135 + }, + { + "epoch": 0.02, + "grad_norm": 1.419260859489441, + "learning_rate": 5.787234042553191e-06, + "loss": 0.7087, + "step": 136 + }, + { + "epoch": 0.02, + "grad_norm": 1.7564315795898438, + "learning_rate": 5.829787234042553e-06, + "loss": 0.6404, + "step": 137 + }, + { + "epoch": 0.02, + "grad_norm": 1.2922577857971191, + "learning_rate": 5.872340425531915e-06, + "loss": 0.7446, + "step": 138 + }, + { + "epoch": 0.02, + "grad_norm": 1.5558539628982544, + "learning_rate": 5.9148936170212766e-06, + "loss": 0.6687, + "step": 139 + }, + { + "epoch": 0.02, + "grad_norm": 1.5046948194503784, + "learning_rate": 5.957446808510638e-06, + "loss": 0.661, + "step": 140 + }, + { + "epoch": 0.02, + "grad_norm": 1.169022560119629, + "learning_rate": 6e-06, + "loss": 0.6933, + "step": 141 + }, + { + "epoch": 0.02, + "grad_norm": 1.2964138984680176, + "learning_rate": 6.042553191489362e-06, + "loss": 0.621, + "step": 142 + }, + { + "epoch": 0.02, + "grad_norm": 1.2966022491455078, + "learning_rate": 6.085106382978724e-06, + "loss": 0.6333, + "step": 143 + }, + { + "epoch": 0.02, + "grad_norm": 2.7783942222595215, + "learning_rate": 6.1276595744680855e-06, + "loss": 0.6224, + "step": 144 + }, + { + "epoch": 0.02, + "grad_norm": 2.098170280456543, + "learning_rate": 6.170212765957447e-06, + "loss": 0.6892, + "step": 145 + }, + { + "epoch": 0.02, + "grad_norm": 1.532152533531189, + "learning_rate": 6.212765957446809e-06, + "loss": 0.6353, + "step": 146 + }, + { + "epoch": 0.02, + "grad_norm": 2.6122751235961914, + "learning_rate": 6.255319148936171e-06, + "loss": 0.603, + "step": 147 + }, + { + "epoch": 0.02, + "grad_norm": 1.3150262832641602, + "learning_rate": 6.297872340425533e-06, + "loss": 0.68, + "step": 148 + }, + { + "epoch": 0.02, + "grad_norm": 1.8408077955245972, + "learning_rate": 6.3404255319148945e-06, + "loss": 0.7014, + "step": 149 + }, + { + "epoch": 0.02, + "grad_norm": 1.3194700479507446, + "learning_rate": 6.382978723404256e-06, + "loss": 0.6772, + "step": 150 + }, + { + "epoch": 0.02, + "grad_norm": 1.445401668548584, + "learning_rate": 6.425531914893618e-06, + "loss": 0.5895, + "step": 151 + }, + { + "epoch": 0.02, + "grad_norm": 1.475083827972412, + "learning_rate": 6.46808510638298e-06, + "loss": 0.6744, + "step": 152 + }, + { + "epoch": 0.02, + "grad_norm": 3.218212127685547, + "learning_rate": 6.510638297872342e-06, + "loss": 0.6075, + "step": 153 + }, + { + "epoch": 0.02, + "grad_norm": 1.3714830875396729, + "learning_rate": 6.553191489361702e-06, + "loss": 0.6051, + "step": 154 + }, + { + "epoch": 0.02, + "grad_norm": 1.9937951564788818, + "learning_rate": 6.595744680851064e-06, + "loss": 0.7034, + "step": 155 + }, + { + "epoch": 0.02, + "grad_norm": 1.379773497581482, + "learning_rate": 6.6382978723404254e-06, + "loss": 0.6591, + "step": 156 + }, + { + "epoch": 0.02, + "grad_norm": 1.3940249681472778, + "learning_rate": 6.680851063829787e-06, + "loss": 0.6154, + "step": 157 + }, + { + "epoch": 0.02, + "grad_norm": 1.862964391708374, + "learning_rate": 6.723404255319149e-06, + "loss": 0.6576, + "step": 158 + }, + { + "epoch": 0.02, + "grad_norm": 1.3830742835998535, + "learning_rate": 6.765957446808511e-06, + "loss": 0.6628, + "step": 159 + }, + { + "epoch": 0.02, + "grad_norm": 1.437272548675537, + "learning_rate": 6.808510638297873e-06, + "loss": 0.6593, + "step": 160 + }, + { + "epoch": 0.02, + "grad_norm": 1.473165512084961, + "learning_rate": 6.8510638297872344e-06, + "loss": 0.7078, + "step": 161 + }, + { + "epoch": 0.02, + "grad_norm": 1.078347086906433, + "learning_rate": 6.893617021276596e-06, + "loss": 0.5887, + "step": 162 + }, + { + "epoch": 0.02, + "grad_norm": 1.8319272994995117, + "learning_rate": 6.936170212765958e-06, + "loss": 0.6499, + "step": 163 + }, + { + "epoch": 0.02, + "grad_norm": 1.3383046388626099, + "learning_rate": 6.97872340425532e-06, + "loss": 0.7129, + "step": 164 + }, + { + "epoch": 0.02, + "grad_norm": 1.6647669076919556, + "learning_rate": 7.021276595744682e-06, + "loss": 0.5852, + "step": 165 + }, + { + "epoch": 0.02, + "grad_norm": 1.294133186340332, + "learning_rate": 7.0638297872340434e-06, + "loss": 0.6741, + "step": 166 + }, + { + "epoch": 0.02, + "grad_norm": 2.3639767169952393, + "learning_rate": 7.106382978723405e-06, + "loss": 0.5949, + "step": 167 + }, + { + "epoch": 0.02, + "grad_norm": 1.6028077602386475, + "learning_rate": 7.148936170212767e-06, + "loss": 0.653, + "step": 168 + }, + { + "epoch": 0.02, + "grad_norm": 1.474517583847046, + "learning_rate": 7.191489361702129e-06, + "loss": 0.6168, + "step": 169 + }, + { + "epoch": 0.02, + "grad_norm": 1.2433522939682007, + "learning_rate": 7.234042553191491e-06, + "loss": 0.7361, + "step": 170 + }, + { + "epoch": 0.02, + "grad_norm": 1.665745735168457, + "learning_rate": 7.2765957446808524e-06, + "loss": 0.6894, + "step": 171 + }, + { + "epoch": 0.02, + "grad_norm": 1.8916876316070557, + "learning_rate": 7.3191489361702125e-06, + "loss": 0.6372, + "step": 172 + }, + { + "epoch": 0.02, + "grad_norm": 1.5415910482406616, + "learning_rate": 7.361702127659574e-06, + "loss": 0.5954, + "step": 173 + }, + { + "epoch": 0.02, + "grad_norm": 2.2413551807403564, + "learning_rate": 7.404255319148936e-06, + "loss": 0.69, + "step": 174 + }, + { + "epoch": 0.02, + "grad_norm": 1.9613412618637085, + "learning_rate": 7.446808510638298e-06, + "loss": 0.6342, + "step": 175 + }, + { + "epoch": 0.02, + "grad_norm": 1.9772624969482422, + "learning_rate": 7.48936170212766e-06, + "loss": 0.6331, + "step": 176 + }, + { + "epoch": 0.02, + "grad_norm": 1.7678852081298828, + "learning_rate": 7.5319148936170215e-06, + "loss": 0.629, + "step": 177 + }, + { + "epoch": 0.02, + "grad_norm": 2.1412477493286133, + "learning_rate": 7.574468085106383e-06, + "loss": 0.622, + "step": 178 + }, + { + "epoch": 0.02, + "grad_norm": 1.3353233337402344, + "learning_rate": 7.617021276595745e-06, + "loss": 0.6866, + "step": 179 + }, + { + "epoch": 0.02, + "grad_norm": 1.1513804197311401, + "learning_rate": 7.659574468085107e-06, + "loss": 0.6276, + "step": 180 + }, + { + "epoch": 0.02, + "grad_norm": 1.2835663557052612, + "learning_rate": 7.702127659574469e-06, + "loss": 0.6185, + "step": 181 + }, + { + "epoch": 0.02, + "grad_norm": 1.3194074630737305, + "learning_rate": 7.74468085106383e-06, + "loss": 0.6201, + "step": 182 + }, + { + "epoch": 0.02, + "grad_norm": 1.5225788354873657, + "learning_rate": 7.787234042553192e-06, + "loss": 0.5948, + "step": 183 + }, + { + "epoch": 0.02, + "grad_norm": 1.1869926452636719, + "learning_rate": 7.829787234042554e-06, + "loss": 0.5794, + "step": 184 + }, + { + "epoch": 0.02, + "grad_norm": 1.3368916511535645, + "learning_rate": 7.872340425531916e-06, + "loss": 0.7929, + "step": 185 + }, + { + "epoch": 0.02, + "grad_norm": 1.3683775663375854, + "learning_rate": 7.914893617021278e-06, + "loss": 0.7516, + "step": 186 + }, + { + "epoch": 0.02, + "grad_norm": 1.7117352485656738, + "learning_rate": 7.95744680851064e-06, + "loss": 0.6344, + "step": 187 + }, + { + "epoch": 0.02, + "grad_norm": 1.4180941581726074, + "learning_rate": 8.000000000000001e-06, + "loss": 0.6158, + "step": 188 + }, + { + "epoch": 0.02, + "grad_norm": 1.504319667816162, + "learning_rate": 8.042553191489363e-06, + "loss": 0.79, + "step": 189 + }, + { + "epoch": 0.02, + "grad_norm": 1.4916445016860962, + "learning_rate": 8.085106382978723e-06, + "loss": 0.7096, + "step": 190 + }, + { + "epoch": 0.02, + "grad_norm": 1.3161081075668335, + "learning_rate": 8.127659574468085e-06, + "loss": 0.6206, + "step": 191 + }, + { + "epoch": 0.02, + "grad_norm": 1.3922781944274902, + "learning_rate": 8.170212765957447e-06, + "loss": 0.6443, + "step": 192 + }, + { + "epoch": 0.02, + "grad_norm": 1.5196666717529297, + "learning_rate": 8.212765957446809e-06, + "loss": 0.6352, + "step": 193 + }, + { + "epoch": 0.02, + "grad_norm": 1.4567962884902954, + "learning_rate": 8.25531914893617e-06, + "loss": 0.5775, + "step": 194 + }, + { + "epoch": 0.02, + "grad_norm": 1.2705228328704834, + "learning_rate": 8.297872340425532e-06, + "loss": 0.6438, + "step": 195 + }, + { + "epoch": 0.03, + "grad_norm": 1.4779032468795776, + "learning_rate": 8.340425531914894e-06, + "loss": 0.6037, + "step": 196 + }, + { + "epoch": 0.03, + "grad_norm": 1.4720426797866821, + "learning_rate": 8.382978723404256e-06, + "loss": 0.594, + "step": 197 + }, + { + "epoch": 0.03, + "grad_norm": 1.6737343072891235, + "learning_rate": 8.425531914893618e-06, + "loss": 0.6083, + "step": 198 + }, + { + "epoch": 0.03, + "grad_norm": 1.425614356994629, + "learning_rate": 8.46808510638298e-06, + "loss": 0.6219, + "step": 199 + }, + { + "epoch": 0.03, + "grad_norm": 1.4513856172561646, + "learning_rate": 8.510638297872341e-06, + "loss": 0.6389, + "step": 200 + }, + { + "epoch": 0.03, + "grad_norm": 1.7207236289978027, + "learning_rate": 8.553191489361703e-06, + "loss": 0.5883, + "step": 201 + }, + { + "epoch": 0.03, + "grad_norm": 1.8808568716049194, + "learning_rate": 8.595744680851065e-06, + "loss": 0.5775, + "step": 202 + }, + { + "epoch": 0.03, + "grad_norm": 2.1202540397644043, + "learning_rate": 8.638297872340427e-06, + "loss": 0.5386, + "step": 203 + }, + { + "epoch": 0.03, + "grad_norm": 1.2687338590621948, + "learning_rate": 8.680851063829788e-06, + "loss": 0.6925, + "step": 204 + }, + { + "epoch": 0.03, + "grad_norm": 1.3964853286743164, + "learning_rate": 8.72340425531915e-06, + "loss": 0.6858, + "step": 205 + }, + { + "epoch": 0.03, + "grad_norm": 1.5686947107315063, + "learning_rate": 8.765957446808512e-06, + "loss": 0.6917, + "step": 206 + }, + { + "epoch": 0.03, + "grad_norm": 1.251679539680481, + "learning_rate": 8.808510638297874e-06, + "loss": 0.7344, + "step": 207 + }, + { + "epoch": 0.03, + "grad_norm": 3.4568703174591064, + "learning_rate": 8.851063829787234e-06, + "loss": 0.6002, + "step": 208 + }, + { + "epoch": 0.03, + "grad_norm": 1.5010696649551392, + "learning_rate": 8.893617021276596e-06, + "loss": 0.6429, + "step": 209 + }, + { + "epoch": 0.03, + "grad_norm": 1.468229055404663, + "learning_rate": 8.936170212765958e-06, + "loss": 0.5552, + "step": 210 + }, + { + "epoch": 0.03, + "grad_norm": 1.107020378112793, + "learning_rate": 8.97872340425532e-06, + "loss": 0.5931, + "step": 211 + }, + { + "epoch": 0.03, + "grad_norm": 1.6930768489837646, + "learning_rate": 9.021276595744681e-06, + "loss": 0.6574, + "step": 212 + }, + { + "epoch": 0.03, + "grad_norm": 1.752554178237915, + "learning_rate": 9.063829787234043e-06, + "loss": 0.653, + "step": 213 + }, + { + "epoch": 0.03, + "grad_norm": 1.4542021751403809, + "learning_rate": 9.106382978723405e-06, + "loss": 0.6302, + "step": 214 + }, + { + "epoch": 0.03, + "grad_norm": 1.3564616441726685, + "learning_rate": 9.148936170212767e-06, + "loss": 0.6823, + "step": 215 + }, + { + "epoch": 0.03, + "grad_norm": 1.2571592330932617, + "learning_rate": 9.191489361702128e-06, + "loss": 0.659, + "step": 216 + }, + { + "epoch": 0.03, + "grad_norm": 1.4564090967178345, + "learning_rate": 9.23404255319149e-06, + "loss": 0.6032, + "step": 217 + }, + { + "epoch": 0.03, + "grad_norm": 1.5122485160827637, + "learning_rate": 9.276595744680852e-06, + "loss": 0.6931, + "step": 218 + }, + { + "epoch": 0.03, + "grad_norm": 1.4033244848251343, + "learning_rate": 9.319148936170214e-06, + "loss": 0.6148, + "step": 219 + }, + { + "epoch": 0.03, + "grad_norm": 1.2605109214782715, + "learning_rate": 9.361702127659576e-06, + "loss": 0.6233, + "step": 220 + }, + { + "epoch": 0.03, + "grad_norm": 1.4508942365646362, + "learning_rate": 9.404255319148937e-06, + "loss": 0.6114, + "step": 221 + }, + { + "epoch": 0.03, + "grad_norm": 2.8977622985839844, + "learning_rate": 9.446808510638299e-06, + "loss": 0.5908, + "step": 222 + }, + { + "epoch": 0.03, + "grad_norm": 1.4348738193511963, + "learning_rate": 9.489361702127661e-06, + "loss": 0.6485, + "step": 223 + }, + { + "epoch": 0.03, + "grad_norm": 1.2617361545562744, + "learning_rate": 9.531914893617023e-06, + "loss": 0.6773, + "step": 224 + }, + { + "epoch": 0.03, + "grad_norm": 1.2666085958480835, + "learning_rate": 9.574468085106385e-06, + "loss": 0.5397, + "step": 225 + }, + { + "epoch": 0.03, + "grad_norm": 1.5835410356521606, + "learning_rate": 9.617021276595745e-06, + "loss": 0.6741, + "step": 226 + }, + { + "epoch": 0.03, + "grad_norm": 1.776307225227356, + "learning_rate": 9.659574468085106e-06, + "loss": 0.6239, + "step": 227 + }, + { + "epoch": 0.03, + "grad_norm": 1.3855375051498413, + "learning_rate": 9.702127659574468e-06, + "loss": 0.648, + "step": 228 + }, + { + "epoch": 0.03, + "grad_norm": 1.173299789428711, + "learning_rate": 9.74468085106383e-06, + "loss": 0.6081, + "step": 229 + }, + { + "epoch": 0.03, + "grad_norm": 1.3490828275680542, + "learning_rate": 9.787234042553192e-06, + "loss": 0.662, + "step": 230 + }, + { + "epoch": 0.03, + "grad_norm": 1.2873908281326294, + "learning_rate": 9.829787234042554e-06, + "loss": 0.6217, + "step": 231 + }, + { + "epoch": 0.03, + "grad_norm": 1.5484164953231812, + "learning_rate": 9.872340425531915e-06, + "loss": 0.6764, + "step": 232 + }, + { + "epoch": 0.03, + "grad_norm": 1.4617042541503906, + "learning_rate": 9.914893617021277e-06, + "loss": 0.7215, + "step": 233 + }, + { + "epoch": 0.03, + "grad_norm": 1.3531588315963745, + "learning_rate": 9.957446808510639e-06, + "loss": 0.589, + "step": 234 + }, + { + "epoch": 0.03, + "grad_norm": 1.4462039470672607, + "learning_rate": 1e-05, + "loss": 0.6209, + "step": 235 + }, + { + "epoch": 0.03, + "grad_norm": 1.5413349866867065, + "learning_rate": 9.999999569425815e-06, + "loss": 0.6695, + "step": 236 + }, + { + "epoch": 0.03, + "grad_norm": 1.396835207939148, + "learning_rate": 9.999998277703333e-06, + "loss": 0.5872, + "step": 237 + }, + { + "epoch": 0.03, + "grad_norm": 1.3538330793380737, + "learning_rate": 9.999996124832776e-06, + "loss": 0.7435, + "step": 238 + }, + { + "epoch": 0.03, + "grad_norm": 1.7953016757965088, + "learning_rate": 9.999993110814515e-06, + "loss": 0.6416, + "step": 239 + }, + { + "epoch": 0.03, + "grad_norm": 1.1062463521957397, + "learning_rate": 9.999989235649068e-06, + "loss": 0.717, + "step": 240 + }, + { + "epoch": 0.03, + "grad_norm": 1.3287006616592407, + "learning_rate": 9.999984499337105e-06, + "loss": 0.61, + "step": 241 + }, + { + "epoch": 0.03, + "grad_norm": 1.226269006729126, + "learning_rate": 9.99997890187944e-06, + "loss": 0.6009, + "step": 242 + }, + { + "epoch": 0.03, + "grad_norm": 1.6038551330566406, + "learning_rate": 9.99997244327704e-06, + "loss": 0.7504, + "step": 243 + }, + { + "epoch": 0.03, + "grad_norm": 1.3797829151153564, + "learning_rate": 9.999965123531012e-06, + "loss": 0.5251, + "step": 244 + }, + { + "epoch": 0.03, + "grad_norm": 1.4653568267822266, + "learning_rate": 9.999956942642622e-06, + "loss": 0.6416, + "step": 245 + }, + { + "epoch": 0.03, + "grad_norm": 1.0714013576507568, + "learning_rate": 9.999947900613274e-06, + "loss": 0.5643, + "step": 246 + }, + { + "epoch": 0.03, + "grad_norm": 1.486270785331726, + "learning_rate": 9.999937997444528e-06, + "loss": 0.6592, + "step": 247 + }, + { + "epoch": 0.03, + "grad_norm": 1.3784781694412231, + "learning_rate": 9.999927233138092e-06, + "loss": 0.722, + "step": 248 + }, + { + "epoch": 0.03, + "grad_norm": 1.3505719900131226, + "learning_rate": 9.999915607695814e-06, + "loss": 0.6928, + "step": 249 + }, + { + "epoch": 0.03, + "grad_norm": 1.3439764976501465, + "learning_rate": 9.999903121119701e-06, + "loss": 0.5936, + "step": 250 + }, + { + "epoch": 0.03, + "grad_norm": 1.3594400882720947, + "learning_rate": 9.999889773411903e-06, + "loss": 0.6679, + "step": 251 + }, + { + "epoch": 0.03, + "grad_norm": 1.1484427452087402, + "learning_rate": 9.999875564574717e-06, + "loss": 0.6183, + "step": 252 + }, + { + "epoch": 0.03, + "grad_norm": 1.2887957096099854, + "learning_rate": 9.999860494610595e-06, + "loss": 0.5997, + "step": 253 + }, + { + "epoch": 0.03, + "grad_norm": 1.2470365762710571, + "learning_rate": 9.999844563522123e-06, + "loss": 0.6587, + "step": 254 + }, + { + "epoch": 0.03, + "grad_norm": 1.387832522392273, + "learning_rate": 9.999827771312053e-06, + "loss": 0.596, + "step": 255 + }, + { + "epoch": 0.03, + "grad_norm": 1.2869324684143066, + "learning_rate": 9.999810117983275e-06, + "loss": 0.6862, + "step": 256 + }, + { + "epoch": 0.03, + "grad_norm": 1.3541619777679443, + "learning_rate": 9.99979160353883e-06, + "loss": 0.6792, + "step": 257 + }, + { + "epoch": 0.03, + "grad_norm": 1.6047093868255615, + "learning_rate": 9.999772227981905e-06, + "loss": 0.6403, + "step": 258 + }, + { + "epoch": 0.03, + "grad_norm": 1.1892220973968506, + "learning_rate": 9.999751991315838e-06, + "loss": 0.6682, + "step": 259 + }, + { + "epoch": 0.03, + "grad_norm": 1.4554706811904907, + "learning_rate": 9.999730893544115e-06, + "loss": 0.5767, + "step": 260 + }, + { + "epoch": 0.03, + "grad_norm": 1.3690476417541504, + "learning_rate": 9.999708934670366e-06, + "loss": 0.7132, + "step": 261 + }, + { + "epoch": 0.03, + "grad_norm": 1.6959372758865356, + "learning_rate": 9.99968611469838e-06, + "loss": 0.6062, + "step": 262 + }, + { + "epoch": 0.03, + "grad_norm": 1.7080563306808472, + "learning_rate": 9.99966243363208e-06, + "loss": 0.6362, + "step": 263 + }, + { + "epoch": 0.03, + "grad_norm": 1.2266885042190552, + "learning_rate": 9.999637891475549e-06, + "loss": 0.6942, + "step": 264 + }, + { + "epoch": 0.03, + "grad_norm": 1.2976837158203125, + "learning_rate": 9.99961248823301e-06, + "loss": 0.7392, + "step": 265 + }, + { + "epoch": 0.03, + "grad_norm": 1.6553566455841064, + "learning_rate": 9.999586223908845e-06, + "loss": 0.6517, + "step": 266 + }, + { + "epoch": 0.03, + "grad_norm": 1.199306607246399, + "learning_rate": 9.999559098507571e-06, + "loss": 0.6287, + "step": 267 + }, + { + "epoch": 0.03, + "grad_norm": 1.3642069101333618, + "learning_rate": 9.999531112033863e-06, + "loss": 0.5917, + "step": 268 + }, + { + "epoch": 0.03, + "grad_norm": 1.6378116607666016, + "learning_rate": 9.99950226449254e-06, + "loss": 0.6069, + "step": 269 + }, + { + "epoch": 0.03, + "grad_norm": 2.4352381229400635, + "learning_rate": 9.99947255588857e-06, + "loss": 0.6093, + "step": 270 + }, + { + "epoch": 0.03, + "grad_norm": 1.4313955307006836, + "learning_rate": 9.999441986227071e-06, + "loss": 0.5984, + "step": 271 + }, + { + "epoch": 0.03, + "grad_norm": 1.6048554182052612, + "learning_rate": 9.999410555513308e-06, + "loss": 0.6339, + "step": 272 + }, + { + "epoch": 0.03, + "grad_norm": 1.5470483303070068, + "learning_rate": 9.999378263752691e-06, + "loss": 0.6551, + "step": 273 + }, + { + "epoch": 0.04, + "grad_norm": 1.2593071460723877, + "learning_rate": 9.999345110950787e-06, + "loss": 0.6564, + "step": 274 + }, + { + "epoch": 0.04, + "grad_norm": 1.4662338495254517, + "learning_rate": 9.999311097113303e-06, + "loss": 0.503, + "step": 275 + }, + { + "epoch": 0.04, + "grad_norm": 1.6436269283294678, + "learning_rate": 9.999276222246094e-06, + "loss": 0.7033, + "step": 276 + }, + { + "epoch": 0.04, + "grad_norm": 2.233489990234375, + "learning_rate": 9.999240486355173e-06, + "loss": 0.6238, + "step": 277 + }, + { + "epoch": 0.04, + "grad_norm": 1.1347347497940063, + "learning_rate": 9.999203889446691e-06, + "loss": 0.6618, + "step": 278 + }, + { + "epoch": 0.04, + "grad_norm": 1.7216888666152954, + "learning_rate": 9.999166431526952e-06, + "loss": 0.5455, + "step": 279 + }, + { + "epoch": 0.04, + "grad_norm": 1.1763736009597778, + "learning_rate": 9.999128112602406e-06, + "loss": 0.5321, + "step": 280 + }, + { + "epoch": 0.04, + "grad_norm": 1.6581827402114868, + "learning_rate": 9.999088932679653e-06, + "loss": 0.6222, + "step": 281 + }, + { + "epoch": 0.04, + "grad_norm": 1.3219795227050781, + "learning_rate": 9.999048891765443e-06, + "loss": 0.601, + "step": 282 + }, + { + "epoch": 0.04, + "grad_norm": 1.4004895687103271, + "learning_rate": 9.999007989866671e-06, + "loss": 0.6141, + "step": 283 + }, + { + "epoch": 0.04, + "grad_norm": 2.341827630996704, + "learning_rate": 9.99896622699038e-06, + "loss": 0.6072, + "step": 284 + }, + { + "epoch": 0.04, + "grad_norm": 1.2262059450149536, + "learning_rate": 9.998923603143767e-06, + "loss": 0.7096, + "step": 285 + }, + { + "epoch": 0.04, + "grad_norm": 1.5562580823898315, + "learning_rate": 9.998880118334167e-06, + "loss": 0.6483, + "step": 286 + }, + { + "epoch": 0.04, + "grad_norm": 1.8266093730926514, + "learning_rate": 9.998835772569075e-06, + "loss": 0.5435, + "step": 287 + }, + { + "epoch": 0.04, + "grad_norm": 1.291782259941101, + "learning_rate": 9.998790565856124e-06, + "loss": 0.5632, + "step": 288 + }, + { + "epoch": 0.04, + "grad_norm": 1.3557615280151367, + "learning_rate": 9.998744498203104e-06, + "loss": 0.7585, + "step": 289 + }, + { + "epoch": 0.04, + "grad_norm": 1.248962640762329, + "learning_rate": 9.998697569617947e-06, + "loss": 0.6625, + "step": 290 + }, + { + "epoch": 0.04, + "grad_norm": 1.591097116470337, + "learning_rate": 9.998649780108737e-06, + "loss": 0.6722, + "step": 291 + }, + { + "epoch": 0.04, + "grad_norm": 1.3779689073562622, + "learning_rate": 9.998601129683703e-06, + "loss": 0.6342, + "step": 292 + }, + { + "epoch": 0.04, + "grad_norm": 2.6783649921417236, + "learning_rate": 9.998551618351225e-06, + "loss": 0.5614, + "step": 293 + }, + { + "epoch": 0.04, + "grad_norm": 1.789115071296692, + "learning_rate": 9.998501246119828e-06, + "loss": 0.6905, + "step": 294 + }, + { + "epoch": 0.04, + "grad_norm": 1.309683084487915, + "learning_rate": 9.998450012998192e-06, + "loss": 0.619, + "step": 295 + }, + { + "epoch": 0.04, + "grad_norm": 1.9988428354263306, + "learning_rate": 9.998397918995138e-06, + "loss": 0.5937, + "step": 296 + }, + { + "epoch": 0.04, + "grad_norm": 1.2818232774734497, + "learning_rate": 9.998344964119639e-06, + "loss": 0.6503, + "step": 297 + }, + { + "epoch": 0.04, + "grad_norm": 1.3331990242004395, + "learning_rate": 9.998291148380813e-06, + "loss": 0.7047, + "step": 298 + }, + { + "epoch": 0.04, + "grad_norm": 1.3238475322723389, + "learning_rate": 9.998236471787933e-06, + "loss": 0.758, + "step": 299 + }, + { + "epoch": 0.04, + "grad_norm": 2.516418218612671, + "learning_rate": 9.998180934350413e-06, + "loss": 0.6936, + "step": 300 + }, + { + "epoch": 0.04, + "grad_norm": 1.0435734987258911, + "learning_rate": 9.998124536077819e-06, + "loss": 0.6652, + "step": 301 + }, + { + "epoch": 0.04, + "grad_norm": 1.999271035194397, + "learning_rate": 9.998067276979863e-06, + "loss": 0.639, + "step": 302 + }, + { + "epoch": 0.04, + "grad_norm": 1.2757498025894165, + "learning_rate": 9.99800915706641e-06, + "loss": 0.5991, + "step": 303 + }, + { + "epoch": 0.04, + "grad_norm": 1.3763235807418823, + "learning_rate": 9.997950176347469e-06, + "loss": 0.6986, + "step": 304 + }, + { + "epoch": 0.04, + "grad_norm": 1.3771135807037354, + "learning_rate": 9.997890334833195e-06, + "loss": 0.7234, + "step": 305 + }, + { + "epoch": 0.04, + "grad_norm": 1.662296175956726, + "learning_rate": 9.997829632533897e-06, + "loss": 0.7195, + "step": 306 + }, + { + "epoch": 0.04, + "grad_norm": 1.1534143686294556, + "learning_rate": 9.99776806946003e-06, + "loss": 0.6387, + "step": 307 + }, + { + "epoch": 0.04, + "grad_norm": 1.804293155670166, + "learning_rate": 9.997705645622195e-06, + "loss": 0.6533, + "step": 308 + }, + { + "epoch": 0.04, + "grad_norm": 1.2307173013687134, + "learning_rate": 9.997642361031147e-06, + "loss": 0.6438, + "step": 309 + }, + { + "epoch": 0.04, + "grad_norm": 1.4131581783294678, + "learning_rate": 9.997578215697782e-06, + "loss": 0.5973, + "step": 310 + }, + { + "epoch": 0.04, + "grad_norm": 1.2833446264266968, + "learning_rate": 9.997513209633149e-06, + "loss": 0.6023, + "step": 311 + }, + { + "epoch": 0.04, + "grad_norm": 1.623981237411499, + "learning_rate": 9.997447342848443e-06, + "loss": 0.604, + "step": 312 + }, + { + "epoch": 0.04, + "grad_norm": 1.3072586059570312, + "learning_rate": 9.99738061535501e-06, + "loss": 0.6669, + "step": 313 + }, + { + "epoch": 0.04, + "grad_norm": 1.3638581037521362, + "learning_rate": 9.997313027164342e-06, + "loss": 0.7149, + "step": 314 + }, + { + "epoch": 0.04, + "grad_norm": 1.2896875143051147, + "learning_rate": 9.997244578288079e-06, + "loss": 0.6452, + "step": 315 + }, + { + "epoch": 0.04, + "grad_norm": 1.1254551410675049, + "learning_rate": 9.99717526873801e-06, + "loss": 0.6556, + "step": 316 + }, + { + "epoch": 0.04, + "grad_norm": 3.2857415676116943, + "learning_rate": 9.997105098526073e-06, + "loss": 0.6838, + "step": 317 + }, + { + "epoch": 0.04, + "grad_norm": 2.3155784606933594, + "learning_rate": 9.997034067664352e-06, + "loss": 0.6333, + "step": 318 + }, + { + "epoch": 0.04, + "grad_norm": 1.4330530166625977, + "learning_rate": 9.996962176165081e-06, + "loss": 0.6069, + "step": 319 + }, + { + "epoch": 0.04, + "grad_norm": 1.828263282775879, + "learning_rate": 9.996889424040644e-06, + "loss": 0.6589, + "step": 320 + }, + { + "epoch": 0.04, + "grad_norm": 1.317088007926941, + "learning_rate": 9.996815811303566e-06, + "loss": 0.6511, + "step": 321 + }, + { + "epoch": 0.04, + "grad_norm": 2.6714189052581787, + "learning_rate": 9.996741337966531e-06, + "loss": 0.6292, + "step": 322 + }, + { + "epoch": 0.04, + "grad_norm": 1.569022297859192, + "learning_rate": 9.996666004042364e-06, + "loss": 0.6129, + "step": 323 + }, + { + "epoch": 0.04, + "grad_norm": 1.6436303853988647, + "learning_rate": 9.996589809544036e-06, + "loss": 0.6159, + "step": 324 + }, + { + "epoch": 0.04, + "grad_norm": 1.2534624338150024, + "learning_rate": 9.996512754484675e-06, + "loss": 0.6133, + "step": 325 + }, + { + "epoch": 0.04, + "grad_norm": 2.129281759262085, + "learning_rate": 9.996434838877549e-06, + "loss": 0.6453, + "step": 326 + }, + { + "epoch": 0.04, + "grad_norm": 1.673301100730896, + "learning_rate": 9.996356062736077e-06, + "loss": 0.6298, + "step": 327 + }, + { + "epoch": 0.04, + "grad_norm": 1.3405145406723022, + "learning_rate": 9.99627642607383e-06, + "loss": 0.6878, + "step": 328 + }, + { + "epoch": 0.04, + "grad_norm": 1.0584392547607422, + "learning_rate": 9.996195928904522e-06, + "loss": 0.6118, + "step": 329 + }, + { + "epoch": 0.04, + "grad_norm": 1.4414960145950317, + "learning_rate": 9.996114571242015e-06, + "loss": 0.5883, + "step": 330 + }, + { + "epoch": 0.04, + "grad_norm": 1.4856294393539429, + "learning_rate": 9.996032353100324e-06, + "loss": 0.5985, + "step": 331 + }, + { + "epoch": 0.04, + "grad_norm": 1.4282467365264893, + "learning_rate": 9.995949274493608e-06, + "loss": 0.6427, + "step": 332 + }, + { + "epoch": 0.04, + "grad_norm": 1.0834569931030273, + "learning_rate": 9.995865335436177e-06, + "loss": 0.7503, + "step": 333 + }, + { + "epoch": 0.04, + "grad_norm": 1.4855166673660278, + "learning_rate": 9.995780535942485e-06, + "loss": 0.6853, + "step": 334 + }, + { + "epoch": 0.04, + "grad_norm": 2.0462682247161865, + "learning_rate": 9.99569487602714e-06, + "loss": 0.6613, + "step": 335 + }, + { + "epoch": 0.04, + "grad_norm": 1.3587714433670044, + "learning_rate": 9.995608355704893e-06, + "loss": 0.6956, + "step": 336 + }, + { + "epoch": 0.04, + "grad_norm": 2.249725103378296, + "learning_rate": 9.995520974990646e-06, + "loss": 0.6199, + "step": 337 + }, + { + "epoch": 0.04, + "grad_norm": 1.6236398220062256, + "learning_rate": 9.99543273389945e-06, + "loss": 0.6971, + "step": 338 + }, + { + "epoch": 0.04, + "grad_norm": 1.2508240938186646, + "learning_rate": 9.995343632446501e-06, + "loss": 0.7062, + "step": 339 + }, + { + "epoch": 0.04, + "grad_norm": 1.3989686965942383, + "learning_rate": 9.995253670647146e-06, + "loss": 0.7234, + "step": 340 + }, + { + "epoch": 0.04, + "grad_norm": 1.2701517343521118, + "learning_rate": 9.995162848516878e-06, + "loss": 0.6243, + "step": 341 + }, + { + "epoch": 0.04, + "grad_norm": 2.829824924468994, + "learning_rate": 9.995071166071339e-06, + "loss": 0.575, + "step": 342 + }, + { + "epoch": 0.04, + "grad_norm": 1.5379947423934937, + "learning_rate": 9.994978623326321e-06, + "loss": 0.6044, + "step": 343 + }, + { + "epoch": 0.04, + "grad_norm": 1.1864334344863892, + "learning_rate": 9.994885220297763e-06, + "loss": 0.6259, + "step": 344 + }, + { + "epoch": 0.04, + "grad_norm": 1.7398015260696411, + "learning_rate": 9.994790957001748e-06, + "loss": 0.7088, + "step": 345 + }, + { + "epoch": 0.04, + "grad_norm": 1.7878941297531128, + "learning_rate": 9.994695833454515e-06, + "loss": 0.6905, + "step": 346 + }, + { + "epoch": 0.04, + "grad_norm": 1.316868543624878, + "learning_rate": 9.994599849672446e-06, + "loss": 0.6109, + "step": 347 + }, + { + "epoch": 0.04, + "grad_norm": 1.9868359565734863, + "learning_rate": 9.994503005672072e-06, + "loss": 0.6019, + "step": 348 + }, + { + "epoch": 0.04, + "grad_norm": 1.3597651720046997, + "learning_rate": 9.994405301470072e-06, + "loss": 0.5773, + "step": 349 + }, + { + "epoch": 0.04, + "grad_norm": 1.5507285594940186, + "learning_rate": 9.994306737083275e-06, + "loss": 0.6289, + "step": 350 + }, + { + "epoch": 0.04, + "grad_norm": 1.1398341655731201, + "learning_rate": 9.994207312528655e-06, + "loss": 0.6136, + "step": 351 + }, + { + "epoch": 0.05, + "grad_norm": 1.3331668376922607, + "learning_rate": 9.994107027823334e-06, + "loss": 0.6668, + "step": 352 + }, + { + "epoch": 0.05, + "grad_norm": 1.4200987815856934, + "learning_rate": 9.994005882984588e-06, + "loss": 0.6294, + "step": 353 + }, + { + "epoch": 0.05, + "grad_norm": 1.577136754989624, + "learning_rate": 9.993903878029838e-06, + "loss": 0.607, + "step": 354 + }, + { + "epoch": 0.05, + "grad_norm": 1.2228367328643799, + "learning_rate": 9.993801012976647e-06, + "loss": 0.6131, + "step": 355 + }, + { + "epoch": 0.05, + "grad_norm": 1.4739247560501099, + "learning_rate": 9.993697287842735e-06, + "loss": 0.666, + "step": 356 + }, + { + "epoch": 0.05, + "grad_norm": 1.5883527994155884, + "learning_rate": 9.993592702645966e-06, + "loss": 0.7499, + "step": 357 + }, + { + "epoch": 0.05, + "grad_norm": 1.7352406978607178, + "learning_rate": 9.993487257404352e-06, + "loss": 0.6809, + "step": 358 + }, + { + "epoch": 0.05, + "grad_norm": 1.7410889863967896, + "learning_rate": 9.993380952136057e-06, + "loss": 0.6538, + "step": 359 + }, + { + "epoch": 0.05, + "grad_norm": 1.5513193607330322, + "learning_rate": 9.993273786859384e-06, + "loss": 0.63, + "step": 360 + }, + { + "epoch": 0.05, + "grad_norm": 1.3114519119262695, + "learning_rate": 9.993165761592795e-06, + "loss": 0.7388, + "step": 361 + }, + { + "epoch": 0.05, + "grad_norm": 1.255658745765686, + "learning_rate": 9.993056876354892e-06, + "loss": 0.6261, + "step": 362 + }, + { + "epoch": 0.05, + "grad_norm": 1.130070686340332, + "learning_rate": 9.992947131164432e-06, + "loss": 0.6732, + "step": 363 + }, + { + "epoch": 0.05, + "grad_norm": 1.0119282007217407, + "learning_rate": 9.992836526040312e-06, + "loss": 0.5841, + "step": 364 + }, + { + "epoch": 0.05, + "grad_norm": 1.1341041326522827, + "learning_rate": 9.992725061001585e-06, + "loss": 0.6315, + "step": 365 + }, + { + "epoch": 0.05, + "grad_norm": 1.1530150175094604, + "learning_rate": 9.992612736067446e-06, + "loss": 0.5805, + "step": 366 + }, + { + "epoch": 0.05, + "grad_norm": 1.2198214530944824, + "learning_rate": 9.992499551257243e-06, + "loss": 0.6518, + "step": 367 + }, + { + "epoch": 0.05, + "grad_norm": 1.5688096284866333, + "learning_rate": 9.992385506590467e-06, + "loss": 0.6451, + "step": 368 + }, + { + "epoch": 0.05, + "grad_norm": 1.202008605003357, + "learning_rate": 9.992270602086764e-06, + "loss": 0.7313, + "step": 369 + }, + { + "epoch": 0.05, + "grad_norm": 1.660534143447876, + "learning_rate": 9.992154837765919e-06, + "loss": 0.658, + "step": 370 + }, + { + "epoch": 0.05, + "grad_norm": 1.2800931930541992, + "learning_rate": 9.992038213647875e-06, + "loss": 0.6472, + "step": 371 + }, + { + "epoch": 0.05, + "grad_norm": 1.4618778228759766, + "learning_rate": 9.991920729752713e-06, + "loss": 0.6719, + "step": 372 + }, + { + "epoch": 0.05, + "grad_norm": 1.3455302715301514, + "learning_rate": 9.991802386100672e-06, + "loss": 0.638, + "step": 373 + }, + { + "epoch": 0.05, + "grad_norm": 1.2282071113586426, + "learning_rate": 9.99168318271213e-06, + "loss": 0.7115, + "step": 374 + }, + { + "epoch": 0.05, + "grad_norm": 1.1178747415542603, + "learning_rate": 9.991563119607622e-06, + "loss": 0.5796, + "step": 375 + }, + { + "epoch": 0.05, + "grad_norm": 1.1349022388458252, + "learning_rate": 9.991442196807823e-06, + "loss": 0.5702, + "step": 376 + }, + { + "epoch": 0.05, + "grad_norm": 1.206775426864624, + "learning_rate": 9.991320414333559e-06, + "loss": 0.6191, + "step": 377 + }, + { + "epoch": 0.05, + "grad_norm": 2.0808825492858887, + "learning_rate": 9.991197772205808e-06, + "loss": 0.6128, + "step": 378 + }, + { + "epoch": 0.05, + "grad_norm": 1.1514902114868164, + "learning_rate": 9.99107427044569e-06, + "loss": 0.7401, + "step": 379 + }, + { + "epoch": 0.05, + "grad_norm": 1.1866756677627563, + "learning_rate": 9.990949909074477e-06, + "loss": 0.6648, + "step": 380 + }, + { + "epoch": 0.05, + "grad_norm": 1.040029764175415, + "learning_rate": 9.990824688113584e-06, + "loss": 0.6372, + "step": 381 + }, + { + "epoch": 0.05, + "grad_norm": 1.2645063400268555, + "learning_rate": 9.990698607584584e-06, + "loss": 0.6317, + "step": 382 + }, + { + "epoch": 0.05, + "grad_norm": 1.032130479812622, + "learning_rate": 9.990571667509187e-06, + "loss": 0.6034, + "step": 383 + }, + { + "epoch": 0.05, + "grad_norm": 1.203916072845459, + "learning_rate": 9.990443867909258e-06, + "loss": 0.6927, + "step": 384 + }, + { + "epoch": 0.05, + "grad_norm": 1.2320196628570557, + "learning_rate": 9.990315208806807e-06, + "loss": 0.6196, + "step": 385 + }, + { + "epoch": 0.05, + "grad_norm": 1.374750018119812, + "learning_rate": 9.990185690223995e-06, + "loss": 0.6235, + "step": 386 + }, + { + "epoch": 0.05, + "grad_norm": 1.7351794242858887, + "learning_rate": 9.990055312183125e-06, + "loss": 0.5726, + "step": 387 + }, + { + "epoch": 0.05, + "grad_norm": 1.2510026693344116, + "learning_rate": 9.989924074706654e-06, + "loss": 0.7773, + "step": 388 + }, + { + "epoch": 0.05, + "grad_norm": 1.2288150787353516, + "learning_rate": 9.989791977817187e-06, + "loss": 0.6208, + "step": 389 + }, + { + "epoch": 0.05, + "grad_norm": 1.2345741987228394, + "learning_rate": 9.989659021537471e-06, + "loss": 0.6605, + "step": 390 + }, + { + "epoch": 0.05, + "grad_norm": 1.257174015045166, + "learning_rate": 9.989525205890407e-06, + "loss": 0.6108, + "step": 391 + }, + { + "epoch": 0.05, + "grad_norm": 1.563079595565796, + "learning_rate": 9.989390530899044e-06, + "loss": 0.5989, + "step": 392 + }, + { + "epoch": 0.05, + "grad_norm": 1.1823904514312744, + "learning_rate": 9.989254996586575e-06, + "loss": 0.5835, + "step": 393 + }, + { + "epoch": 0.05, + "grad_norm": 1.3369593620300293, + "learning_rate": 9.989118602976343e-06, + "loss": 0.6985, + "step": 394 + }, + { + "epoch": 0.05, + "grad_norm": 2.9907262325286865, + "learning_rate": 9.98898135009184e-06, + "loss": 0.6928, + "step": 395 + }, + { + "epoch": 0.05, + "grad_norm": 4.113388538360596, + "learning_rate": 9.988843237956703e-06, + "loss": 0.5624, + "step": 396 + }, + { + "epoch": 0.05, + "grad_norm": 1.2897573709487915, + "learning_rate": 9.98870426659472e-06, + "loss": 0.6339, + "step": 397 + }, + { + "epoch": 0.05, + "grad_norm": 1.5871529579162598, + "learning_rate": 9.988564436029826e-06, + "loss": 0.627, + "step": 398 + }, + { + "epoch": 0.05, + "grad_norm": 1.316611409187317, + "learning_rate": 9.988423746286105e-06, + "loss": 0.6233, + "step": 399 + }, + { + "epoch": 0.05, + "grad_norm": 1.2401022911071777, + "learning_rate": 9.988282197387787e-06, + "loss": 0.7542, + "step": 400 + }, + { + "epoch": 0.05, + "grad_norm": 1.3629307746887207, + "learning_rate": 9.988139789359252e-06, + "loss": 0.6359, + "step": 401 + }, + { + "epoch": 0.05, + "grad_norm": 1.324845552444458, + "learning_rate": 9.987996522225025e-06, + "loss": 0.7213, + "step": 402 + }, + { + "epoch": 0.05, + "grad_norm": 2.809295177459717, + "learning_rate": 9.987852396009783e-06, + "loss": 0.634, + "step": 403 + }, + { + "epoch": 0.05, + "grad_norm": 1.5875015258789062, + "learning_rate": 9.987707410738346e-06, + "loss": 0.6653, + "step": 404 + }, + { + "epoch": 0.05, + "grad_norm": 1.2210239171981812, + "learning_rate": 9.987561566435688e-06, + "loss": 0.7018, + "step": 405 + }, + { + "epoch": 0.05, + "grad_norm": 1.1161646842956543, + "learning_rate": 9.987414863126926e-06, + "loss": 0.768, + "step": 406 + }, + { + "epoch": 0.05, + "grad_norm": 1.0861924886703491, + "learning_rate": 9.987267300837327e-06, + "loss": 0.5714, + "step": 407 + }, + { + "epoch": 0.05, + "grad_norm": 1.3647254705429077, + "learning_rate": 9.987118879592303e-06, + "loss": 0.7023, + "step": 408 + }, + { + "epoch": 0.05, + "grad_norm": 1.4136096239089966, + "learning_rate": 9.986969599417422e-06, + "loss": 0.6671, + "step": 409 + }, + { + "epoch": 0.05, + "grad_norm": 1.715958595275879, + "learning_rate": 9.98681946033839e-06, + "loss": 0.6588, + "step": 410 + }, + { + "epoch": 0.05, + "grad_norm": 1.387445330619812, + "learning_rate": 9.986668462381065e-06, + "loss": 0.7723, + "step": 411 + }, + { + "epoch": 0.05, + "grad_norm": 1.0558326244354248, + "learning_rate": 9.98651660557146e-06, + "loss": 0.6056, + "step": 412 + }, + { + "epoch": 0.05, + "grad_norm": 1.917008638381958, + "learning_rate": 9.98636388993572e-06, + "loss": 0.6206, + "step": 413 + }, + { + "epoch": 0.05, + "grad_norm": 1.2889763116836548, + "learning_rate": 9.986210315500154e-06, + "loss": 0.7506, + "step": 414 + }, + { + "epoch": 0.05, + "grad_norm": 1.4434642791748047, + "learning_rate": 9.986055882291208e-06, + "loss": 0.5823, + "step": 415 + }, + { + "epoch": 0.05, + "grad_norm": 1.9407073259353638, + "learning_rate": 9.985900590335483e-06, + "loss": 0.7348, + "step": 416 + }, + { + "epoch": 0.05, + "grad_norm": 1.259628415107727, + "learning_rate": 9.985744439659724e-06, + "loss": 0.5983, + "step": 417 + }, + { + "epoch": 0.05, + "grad_norm": 2.4348812103271484, + "learning_rate": 9.985587430290822e-06, + "loss": 0.6106, + "step": 418 + }, + { + "epoch": 0.05, + "grad_norm": 1.2677141427993774, + "learning_rate": 9.985429562255822e-06, + "loss": 0.6385, + "step": 419 + }, + { + "epoch": 0.05, + "grad_norm": 1.1897544860839844, + "learning_rate": 9.985270835581914e-06, + "loss": 0.7339, + "step": 420 + }, + { + "epoch": 0.05, + "grad_norm": 1.1162315607070923, + "learning_rate": 9.985111250296434e-06, + "loss": 0.7, + "step": 421 + }, + { + "epoch": 0.05, + "grad_norm": 1.4913219213485718, + "learning_rate": 9.984950806426865e-06, + "loss": 0.5671, + "step": 422 + }, + { + "epoch": 0.05, + "grad_norm": 1.8393346071243286, + "learning_rate": 9.984789504000844e-06, + "loss": 0.7219, + "step": 423 + }, + { + "epoch": 0.05, + "grad_norm": 1.3586616516113281, + "learning_rate": 9.98462734304615e-06, + "loss": 0.6634, + "step": 424 + }, + { + "epoch": 0.05, + "grad_norm": 1.41817307472229, + "learning_rate": 9.984464323590712e-06, + "loss": 0.5488, + "step": 425 + }, + { + "epoch": 0.05, + "grad_norm": 1.3771775960922241, + "learning_rate": 9.984300445662608e-06, + "loss": 0.64, + "step": 426 + }, + { + "epoch": 0.05, + "grad_norm": 1.3530077934265137, + "learning_rate": 9.984135709290061e-06, + "loss": 0.5721, + "step": 427 + }, + { + "epoch": 0.05, + "grad_norm": 1.2935279607772827, + "learning_rate": 9.983970114501447e-06, + "loss": 0.7433, + "step": 428 + }, + { + "epoch": 0.05, + "grad_norm": 1.2527360916137695, + "learning_rate": 9.983803661325282e-06, + "loss": 0.6697, + "step": 429 + }, + { + "epoch": 0.06, + "grad_norm": 1.7714976072311401, + "learning_rate": 9.983636349790235e-06, + "loss": 0.626, + "step": 430 + }, + { + "epoch": 0.06, + "grad_norm": 1.2465953826904297, + "learning_rate": 9.983468179925124e-06, + "loss": 0.6589, + "step": 431 + }, + { + "epoch": 0.06, + "grad_norm": 1.5122458934783936, + "learning_rate": 9.983299151758912e-06, + "loss": 0.5727, + "step": 432 + }, + { + "epoch": 0.06, + "grad_norm": 1.4289777278900146, + "learning_rate": 9.98312926532071e-06, + "loss": 0.6082, + "step": 433 + }, + { + "epoch": 0.06, + "grad_norm": 1.2806988954544067, + "learning_rate": 9.982958520639778e-06, + "loss": 0.6652, + "step": 434 + }, + { + "epoch": 0.06, + "grad_norm": 1.4954502582550049, + "learning_rate": 9.982786917745523e-06, + "loss": 0.6851, + "step": 435 + }, + { + "epoch": 0.06, + "grad_norm": 2.040862560272217, + "learning_rate": 9.982614456667502e-06, + "loss": 0.6674, + "step": 436 + }, + { + "epoch": 0.06, + "grad_norm": 1.2818940877914429, + "learning_rate": 9.982441137435414e-06, + "loss": 0.6397, + "step": 437 + }, + { + "epoch": 0.06, + "grad_norm": 1.2709026336669922, + "learning_rate": 9.982266960079113e-06, + "loss": 0.6375, + "step": 438 + }, + { + "epoch": 0.06, + "grad_norm": 1.2646985054016113, + "learning_rate": 9.982091924628596e-06, + "loss": 0.7385, + "step": 439 + }, + { + "epoch": 0.06, + "grad_norm": 1.0162150859832764, + "learning_rate": 9.981916031114011e-06, + "loss": 0.5205, + "step": 440 + }, + { + "epoch": 0.06, + "grad_norm": 1.4943784475326538, + "learning_rate": 9.98173927956565e-06, + "loss": 0.6367, + "step": 441 + }, + { + "epoch": 0.06, + "grad_norm": 1.2825793027877808, + "learning_rate": 9.981561670013955e-06, + "loss": 0.6305, + "step": 442 + }, + { + "epoch": 0.06, + "grad_norm": 1.1641693115234375, + "learning_rate": 9.981383202489517e-06, + "loss": 0.6917, + "step": 443 + }, + { + "epoch": 0.06, + "grad_norm": 1.4706101417541504, + "learning_rate": 9.981203877023074e-06, + "loss": 0.6046, + "step": 444 + }, + { + "epoch": 0.06, + "grad_norm": 1.171685814857483, + "learning_rate": 9.981023693645509e-06, + "loss": 0.5857, + "step": 445 + }, + { + "epoch": 0.06, + "grad_norm": 1.3791475296020508, + "learning_rate": 9.980842652387855e-06, + "loss": 0.542, + "step": 446 + }, + { + "epoch": 0.06, + "grad_norm": 1.1576347351074219, + "learning_rate": 9.980660753281296e-06, + "loss": 0.6192, + "step": 447 + }, + { + "epoch": 0.06, + "grad_norm": 1.4999419450759888, + "learning_rate": 9.980477996357154e-06, + "loss": 0.6426, + "step": 448 + }, + { + "epoch": 0.06, + "grad_norm": 1.2739951610565186, + "learning_rate": 9.980294381646912e-06, + "loss": 0.6035, + "step": 449 + }, + { + "epoch": 0.06, + "grad_norm": 1.450174331665039, + "learning_rate": 9.98010990918219e-06, + "loss": 0.667, + "step": 450 + }, + { + "epoch": 0.06, + "grad_norm": 1.3379954099655151, + "learning_rate": 9.979924578994761e-06, + "loss": 0.6993, + "step": 451 + }, + { + "epoch": 0.06, + "grad_norm": 1.5418367385864258, + "learning_rate": 9.979738391116543e-06, + "loss": 0.5721, + "step": 452 + }, + { + "epoch": 0.06, + "grad_norm": 1.351285696029663, + "learning_rate": 9.979551345579606e-06, + "loss": 0.6084, + "step": 453 + }, + { + "epoch": 0.06, + "grad_norm": 1.3503434658050537, + "learning_rate": 9.97936344241616e-06, + "loss": 0.7051, + "step": 454 + }, + { + "epoch": 0.06, + "grad_norm": 1.6836271286010742, + "learning_rate": 9.979174681658575e-06, + "loss": 0.6324, + "step": 455 + }, + { + "epoch": 0.06, + "grad_norm": 1.482871174812317, + "learning_rate": 9.978985063339353e-06, + "loss": 0.634, + "step": 456 + }, + { + "epoch": 0.06, + "grad_norm": 1.6858736276626587, + "learning_rate": 9.978794587491156e-06, + "loss": 0.6554, + "step": 457 + }, + { + "epoch": 0.06, + "grad_norm": 1.5458368062973022, + "learning_rate": 9.97860325414679e-06, + "loss": 0.599, + "step": 458 + }, + { + "epoch": 0.06, + "grad_norm": 1.42569899559021, + "learning_rate": 9.978411063339205e-06, + "loss": 0.6472, + "step": 459 + }, + { + "epoch": 0.06, + "grad_norm": 1.1632460355758667, + "learning_rate": 9.978218015101508e-06, + "loss": 0.651, + "step": 460 + }, + { + "epoch": 0.06, + "grad_norm": 1.2948800325393677, + "learning_rate": 9.978024109466942e-06, + "loss": 0.5811, + "step": 461 + }, + { + "epoch": 0.06, + "grad_norm": 2.991281509399414, + "learning_rate": 9.977829346468906e-06, + "loss": 0.6956, + "step": 462 + }, + { + "epoch": 0.06, + "grad_norm": 1.260887622833252, + "learning_rate": 9.977633726140942e-06, + "loss": 0.6101, + "step": 463 + }, + { + "epoch": 0.06, + "grad_norm": 1.1584416627883911, + "learning_rate": 9.977437248516744e-06, + "loss": 0.5652, + "step": 464 + }, + { + "epoch": 0.06, + "grad_norm": 1.210117220878601, + "learning_rate": 9.977239913630149e-06, + "loss": 0.7002, + "step": 465 + }, + { + "epoch": 0.06, + "grad_norm": 1.4283682107925415, + "learning_rate": 9.977041721515146e-06, + "loss": 0.6748, + "step": 466 + }, + { + "epoch": 0.06, + "grad_norm": 1.113726258277893, + "learning_rate": 9.976842672205868e-06, + "loss": 0.6123, + "step": 467 + }, + { + "epoch": 0.06, + "grad_norm": 1.1056057214736938, + "learning_rate": 9.976642765736597e-06, + "loss": 0.6603, + "step": 468 + }, + { + "epoch": 0.06, + "grad_norm": 1.267026424407959, + "learning_rate": 9.976442002141767e-06, + "loss": 0.6771, + "step": 469 + }, + { + "epoch": 0.06, + "grad_norm": 1.5912866592407227, + "learning_rate": 9.976240381455949e-06, + "loss": 0.6103, + "step": 470 + }, + { + "epoch": 0.06, + "grad_norm": 1.5797346830368042, + "learning_rate": 9.976037903713872e-06, + "loss": 0.6114, + "step": 471 + }, + { + "epoch": 0.06, + "grad_norm": 1.223134160041809, + "learning_rate": 9.975834568950406e-06, + "loss": 0.6158, + "step": 472 + }, + { + "epoch": 0.06, + "grad_norm": 1.211577296257019, + "learning_rate": 9.975630377200575e-06, + "loss": 0.6709, + "step": 473 + }, + { + "epoch": 0.06, + "grad_norm": 1.0716959238052368, + "learning_rate": 9.975425328499546e-06, + "loss": 0.6498, + "step": 474 + }, + { + "epoch": 0.06, + "grad_norm": 1.189151644706726, + "learning_rate": 9.975219422882631e-06, + "loss": 0.6125, + "step": 475 + }, + { + "epoch": 0.06, + "grad_norm": 1.2349804639816284, + "learning_rate": 9.975012660385298e-06, + "loss": 0.6524, + "step": 476 + }, + { + "epoch": 0.06, + "grad_norm": 1.7489399909973145, + "learning_rate": 9.974805041043154e-06, + "loss": 0.7575, + "step": 477 + }, + { + "epoch": 0.06, + "grad_norm": 1.7448736429214478, + "learning_rate": 9.974596564891958e-06, + "loss": 0.6242, + "step": 478 + }, + { + "epoch": 0.06, + "grad_norm": 1.4200482368469238, + "learning_rate": 9.974387231967618e-06, + "loss": 0.7357, + "step": 479 + }, + { + "epoch": 0.06, + "grad_norm": 1.3394062519073486, + "learning_rate": 9.974177042306184e-06, + "loss": 0.7014, + "step": 480 + }, + { + "epoch": 0.06, + "grad_norm": 1.8164784908294678, + "learning_rate": 9.973965995943857e-06, + "loss": 0.745, + "step": 481 + }, + { + "epoch": 0.06, + "grad_norm": 1.347047209739685, + "learning_rate": 9.973754092916989e-06, + "loss": 0.7665, + "step": 482 + }, + { + "epoch": 0.06, + "grad_norm": 1.2569183111190796, + "learning_rate": 9.973541333262073e-06, + "loss": 0.604, + "step": 483 + }, + { + "epoch": 0.06, + "grad_norm": 1.4116610288619995, + "learning_rate": 9.973327717015753e-06, + "loss": 0.6642, + "step": 484 + }, + { + "epoch": 0.06, + "grad_norm": 1.4871090650558472, + "learning_rate": 9.97311324421482e-06, + "loss": 0.6648, + "step": 485 + }, + { + "epoch": 0.06, + "grad_norm": 1.491804599761963, + "learning_rate": 9.972897914896215e-06, + "loss": 0.7223, + "step": 486 + }, + { + "epoch": 0.06, + "grad_norm": 1.3295527696609497, + "learning_rate": 9.97268172909702e-06, + "loss": 0.6121, + "step": 487 + }, + { + "epoch": 0.06, + "grad_norm": 1.0643301010131836, + "learning_rate": 9.972464686854473e-06, + "loss": 0.6677, + "step": 488 + }, + { + "epoch": 0.06, + "grad_norm": 1.3908393383026123, + "learning_rate": 9.97224678820595e-06, + "loss": 0.616, + "step": 489 + }, + { + "epoch": 0.06, + "grad_norm": 1.06988525390625, + "learning_rate": 9.972028033188984e-06, + "loss": 0.6551, + "step": 490 + }, + { + "epoch": 0.06, + "grad_norm": 1.0374271869659424, + "learning_rate": 9.97180842184125e-06, + "loss": 0.7074, + "step": 491 + }, + { + "epoch": 0.06, + "grad_norm": 1.0564719438552856, + "learning_rate": 9.97158795420057e-06, + "loss": 0.6798, + "step": 492 + }, + { + "epoch": 0.06, + "grad_norm": 1.26225745677948, + "learning_rate": 9.971366630304917e-06, + "loss": 0.6457, + "step": 493 + }, + { + "epoch": 0.06, + "grad_norm": 1.0807775259017944, + "learning_rate": 9.971144450192408e-06, + "loss": 0.588, + "step": 494 + }, + { + "epoch": 0.06, + "grad_norm": 1.1355947256088257, + "learning_rate": 9.97092141390131e-06, + "loss": 0.7856, + "step": 495 + }, + { + "epoch": 0.06, + "grad_norm": 1.3302438259124756, + "learning_rate": 9.970697521470036e-06, + "loss": 0.614, + "step": 496 + }, + { + "epoch": 0.06, + "grad_norm": 1.4943525791168213, + "learning_rate": 9.970472772937148e-06, + "loss": 0.6594, + "step": 497 + }, + { + "epoch": 0.06, + "grad_norm": 1.7916415929794312, + "learning_rate": 9.970247168341352e-06, + "loss": 0.68, + "step": 498 + }, + { + "epoch": 0.06, + "grad_norm": 1.2984795570373535, + "learning_rate": 9.970020707721505e-06, + "loss": 0.676, + "step": 499 + }, + { + "epoch": 0.06, + "grad_norm": 1.271872878074646, + "learning_rate": 9.969793391116613e-06, + "loss": 0.7291, + "step": 500 + }, + { + "epoch": 0.06, + "grad_norm": 1.3915470838546753, + "learning_rate": 9.969565218565823e-06, + "loss": 0.6427, + "step": 501 + }, + { + "epoch": 0.06, + "grad_norm": 1.4289871454238892, + "learning_rate": 9.969336190108434e-06, + "loss": 0.683, + "step": 502 + }, + { + "epoch": 0.06, + "grad_norm": 1.4833756685256958, + "learning_rate": 9.969106305783894e-06, + "loss": 0.6101, + "step": 503 + }, + { + "epoch": 0.06, + "grad_norm": 1.1719868183135986, + "learning_rate": 9.96887556563179e-06, + "loss": 0.611, + "step": 504 + }, + { + "epoch": 0.06, + "grad_norm": 1.2994431257247925, + "learning_rate": 9.968643969691869e-06, + "loss": 0.6425, + "step": 505 + }, + { + "epoch": 0.06, + "grad_norm": 1.4861727952957153, + "learning_rate": 9.968411518004013e-06, + "loss": 0.6118, + "step": 506 + }, + { + "epoch": 0.06, + "grad_norm": 1.2328635454177856, + "learning_rate": 9.968178210608261e-06, + "loss": 0.5606, + "step": 507 + }, + { + "epoch": 0.07, + "grad_norm": 1.6703741550445557, + "learning_rate": 9.967944047544795e-06, + "loss": 0.6405, + "step": 508 + }, + { + "epoch": 0.07, + "grad_norm": 1.5043929815292358, + "learning_rate": 9.967709028853943e-06, + "loss": 0.6459, + "step": 509 + }, + { + "epoch": 0.07, + "grad_norm": 1.377638339996338, + "learning_rate": 9.967473154576185e-06, + "loss": 0.6761, + "step": 510 + }, + { + "epoch": 0.07, + "grad_norm": 1.2160335779190063, + "learning_rate": 9.96723642475214e-06, + "loss": 0.627, + "step": 511 + }, + { + "epoch": 0.07, + "grad_norm": 1.4559895992279053, + "learning_rate": 9.966998839422587e-06, + "loss": 0.6755, + "step": 512 + }, + { + "epoch": 0.07, + "grad_norm": 1.4201894998550415, + "learning_rate": 9.966760398628441e-06, + "loss": 0.6708, + "step": 513 + }, + { + "epoch": 0.07, + "grad_norm": 1.3131368160247803, + "learning_rate": 9.966521102410769e-06, + "loss": 0.6067, + "step": 514 + }, + { + "epoch": 0.07, + "grad_norm": 1.503257393836975, + "learning_rate": 9.966280950810787e-06, + "loss": 0.6305, + "step": 515 + }, + { + "epoch": 0.07, + "grad_norm": 1.0093584060668945, + "learning_rate": 9.966039943869853e-06, + "loss": 0.5931, + "step": 516 + }, + { + "epoch": 0.07, + "grad_norm": 1.2581803798675537, + "learning_rate": 9.965798081629478e-06, + "loss": 0.5784, + "step": 517 + }, + { + "epoch": 0.07, + "grad_norm": 1.0922319889068604, + "learning_rate": 9.965555364131316e-06, + "loss": 0.6432, + "step": 518 + }, + { + "epoch": 0.07, + "grad_norm": 1.2308220863342285, + "learning_rate": 9.965311791417173e-06, + "loss": 0.6207, + "step": 519 + }, + { + "epoch": 0.07, + "grad_norm": 2.4796323776245117, + "learning_rate": 9.965067363528996e-06, + "loss": 0.697, + "step": 520 + }, + { + "epoch": 0.07, + "grad_norm": 3.5409436225891113, + "learning_rate": 9.964822080508884e-06, + "loss": 0.5826, + "step": 521 + }, + { + "epoch": 0.07, + "grad_norm": 2.550527811050415, + "learning_rate": 9.964575942399085e-06, + "loss": 0.6782, + "step": 522 + }, + { + "epoch": 0.07, + "grad_norm": 1.4845041036605835, + "learning_rate": 9.964328949241987e-06, + "loss": 0.6793, + "step": 523 + }, + { + "epoch": 0.07, + "grad_norm": 1.7661019563674927, + "learning_rate": 9.964081101080133e-06, + "loss": 0.6368, + "step": 524 + }, + { + "epoch": 0.07, + "grad_norm": 1.430008053779602, + "learning_rate": 9.963832397956206e-06, + "loss": 0.6865, + "step": 525 + }, + { + "epoch": 0.07, + "grad_norm": 1.7027018070220947, + "learning_rate": 9.963582839913042e-06, + "loss": 0.6343, + "step": 526 + }, + { + "epoch": 0.07, + "grad_norm": 1.6025042533874512, + "learning_rate": 9.963332426993623e-06, + "loss": 0.6292, + "step": 527 + }, + { + "epoch": 0.07, + "grad_norm": 1.252992033958435, + "learning_rate": 9.963081159241077e-06, + "loss": 0.682, + "step": 528 + }, + { + "epoch": 0.07, + "grad_norm": 1.218106746673584, + "learning_rate": 9.96282903669868e-06, + "loss": 0.5634, + "step": 529 + }, + { + "epoch": 0.07, + "grad_norm": 1.6767704486846924, + "learning_rate": 9.962576059409854e-06, + "loss": 0.6234, + "step": 530 + }, + { + "epoch": 0.07, + "grad_norm": 1.111166000366211, + "learning_rate": 9.96232222741817e-06, + "loss": 0.6472, + "step": 531 + }, + { + "epoch": 0.07, + "grad_norm": 1.2849923372268677, + "learning_rate": 9.962067540767345e-06, + "loss": 0.5941, + "step": 532 + }, + { + "epoch": 0.07, + "grad_norm": 5.2195000648498535, + "learning_rate": 9.961811999501245e-06, + "loss": 0.5667, + "step": 533 + }, + { + "epoch": 0.07, + "grad_norm": 1.106995701789856, + "learning_rate": 9.96155560366388e-06, + "loss": 0.6523, + "step": 534 + }, + { + "epoch": 0.07, + "grad_norm": 2.545809268951416, + "learning_rate": 9.96129835329941e-06, + "loss": 0.5382, + "step": 535 + }, + { + "epoch": 0.07, + "grad_norm": 1.297607183456421, + "learning_rate": 9.96104024845214e-06, + "loss": 0.545, + "step": 536 + }, + { + "epoch": 0.07, + "grad_norm": 1.8389314413070679, + "learning_rate": 9.960781289166524e-06, + "loss": 0.7501, + "step": 537 + }, + { + "epoch": 0.07, + "grad_norm": 1.588292121887207, + "learning_rate": 9.960521475487164e-06, + "loss": 0.5668, + "step": 538 + }, + { + "epoch": 0.07, + "grad_norm": 1.2533057928085327, + "learning_rate": 9.960260807458806e-06, + "loss": 0.6534, + "step": 539 + }, + { + "epoch": 0.07, + "grad_norm": 1.1930246353149414, + "learning_rate": 9.959999285126344e-06, + "loss": 0.615, + "step": 540 + }, + { + "epoch": 0.07, + "grad_norm": 1.2591867446899414, + "learning_rate": 9.959736908534821e-06, + "loss": 0.676, + "step": 541 + }, + { + "epoch": 0.07, + "grad_norm": 1.2490293979644775, + "learning_rate": 9.959473677729426e-06, + "loss": 0.7228, + "step": 542 + }, + { + "epoch": 0.07, + "grad_norm": 0.9905379414558411, + "learning_rate": 9.959209592755495e-06, + "loss": 0.6434, + "step": 543 + }, + { + "epoch": 0.07, + "grad_norm": 1.484781265258789, + "learning_rate": 9.958944653658513e-06, + "loss": 0.7403, + "step": 544 + }, + { + "epoch": 0.07, + "grad_norm": 1.3250967264175415, + "learning_rate": 9.958678860484106e-06, + "loss": 0.7106, + "step": 545 + }, + { + "epoch": 0.07, + "grad_norm": 1.2482023239135742, + "learning_rate": 9.958412213278057e-06, + "loss": 0.6753, + "step": 546 + }, + { + "epoch": 0.07, + "grad_norm": 1.0016635656356812, + "learning_rate": 9.958144712086286e-06, + "loss": 0.6783, + "step": 547 + }, + { + "epoch": 0.07, + "grad_norm": 1.143125057220459, + "learning_rate": 9.957876356954867e-06, + "loss": 0.7045, + "step": 548 + }, + { + "epoch": 0.07, + "grad_norm": 1.036476969718933, + "learning_rate": 9.957607147930017e-06, + "loss": 0.6173, + "step": 549 + }, + { + "epoch": 0.07, + "grad_norm": 1.2345410585403442, + "learning_rate": 9.957337085058103e-06, + "loss": 0.6915, + "step": 550 + }, + { + "epoch": 0.07, + "grad_norm": 1.4582602977752686, + "learning_rate": 9.957066168385639e-06, + "loss": 0.6074, + "step": 551 + }, + { + "epoch": 0.07, + "grad_norm": 1.2883025407791138, + "learning_rate": 9.956794397959283e-06, + "loss": 0.6289, + "step": 552 + }, + { + "epoch": 0.07, + "grad_norm": 1.3238660097122192, + "learning_rate": 9.956521773825842e-06, + "loss": 0.6089, + "step": 553 + }, + { + "epoch": 0.07, + "grad_norm": 1.081383228302002, + "learning_rate": 9.956248296032271e-06, + "loss": 0.6748, + "step": 554 + }, + { + "epoch": 0.07, + "grad_norm": 1.2733744382858276, + "learning_rate": 9.955973964625671e-06, + "loss": 0.6841, + "step": 555 + }, + { + "epoch": 0.07, + "grad_norm": 1.1720517873764038, + "learning_rate": 9.95569877965329e-06, + "loss": 0.6472, + "step": 556 + }, + { + "epoch": 0.07, + "grad_norm": 1.179827094078064, + "learning_rate": 9.955422741162521e-06, + "loss": 0.6, + "step": 557 + }, + { + "epoch": 0.07, + "grad_norm": 1.3065040111541748, + "learning_rate": 9.95514584920091e-06, + "loss": 0.6316, + "step": 558 + }, + { + "epoch": 0.07, + "grad_norm": 1.4125103950500488, + "learning_rate": 9.95486810381614e-06, + "loss": 0.6008, + "step": 559 + }, + { + "epoch": 0.07, + "grad_norm": 1.5217939615249634, + "learning_rate": 9.954589505056054e-06, + "loss": 0.704, + "step": 560 + }, + { + "epoch": 0.07, + "grad_norm": 1.4721038341522217, + "learning_rate": 9.95431005296863e-06, + "loss": 0.6193, + "step": 561 + }, + { + "epoch": 0.07, + "grad_norm": 1.0634781122207642, + "learning_rate": 9.954029747601999e-06, + "loss": 0.5784, + "step": 562 + }, + { + "epoch": 0.07, + "grad_norm": 1.4376946687698364, + "learning_rate": 9.95374858900444e-06, + "loss": 0.6244, + "step": 563 + }, + { + "epoch": 0.07, + "grad_norm": 1.3898831605911255, + "learning_rate": 9.953466577224374e-06, + "loss": 0.6728, + "step": 564 + }, + { + "epoch": 0.07, + "grad_norm": 1.651649832725525, + "learning_rate": 9.953183712310373e-06, + "loss": 0.5602, + "step": 565 + }, + { + "epoch": 0.07, + "grad_norm": 2.1175930500030518, + "learning_rate": 9.952899994311153e-06, + "loss": 0.6814, + "step": 566 + }, + { + "epoch": 0.07, + "grad_norm": 1.2130919694900513, + "learning_rate": 9.952615423275584e-06, + "loss": 0.6513, + "step": 567 + }, + { + "epoch": 0.07, + "grad_norm": 1.3236795663833618, + "learning_rate": 9.952329999252673e-06, + "loss": 0.602, + "step": 568 + }, + { + "epoch": 0.07, + "grad_norm": 2.570998191833496, + "learning_rate": 9.952043722291577e-06, + "loss": 0.7349, + "step": 569 + }, + { + "epoch": 0.07, + "grad_norm": 1.0845404863357544, + "learning_rate": 9.951756592441606e-06, + "loss": 0.6471, + "step": 570 + }, + { + "epoch": 0.07, + "grad_norm": 1.2199392318725586, + "learning_rate": 9.95146860975221e-06, + "loss": 0.6956, + "step": 571 + }, + { + "epoch": 0.07, + "grad_norm": 1.1437824964523315, + "learning_rate": 9.951179774272988e-06, + "loss": 0.5707, + "step": 572 + }, + { + "epoch": 0.07, + "grad_norm": 1.277314305305481, + "learning_rate": 9.950890086053686e-06, + "loss": 0.6485, + "step": 573 + }, + { + "epoch": 0.07, + "grad_norm": 1.1325360536575317, + "learning_rate": 9.9505995451442e-06, + "loss": 0.574, + "step": 574 + }, + { + "epoch": 0.07, + "grad_norm": 1.1564849615097046, + "learning_rate": 9.950308151594563e-06, + "loss": 0.595, + "step": 575 + }, + { + "epoch": 0.07, + "grad_norm": 1.2639933824539185, + "learning_rate": 9.950015905454969e-06, + "loss": 0.712, + "step": 576 + }, + { + "epoch": 0.07, + "grad_norm": 2.110800266265869, + "learning_rate": 9.949722806775746e-06, + "loss": 0.6254, + "step": 577 + }, + { + "epoch": 0.07, + "grad_norm": 1.1644377708435059, + "learning_rate": 9.949428855607377e-06, + "loss": 0.6832, + "step": 578 + }, + { + "epoch": 0.07, + "grad_norm": 1.162166953086853, + "learning_rate": 9.949134052000488e-06, + "loss": 0.742, + "step": 579 + }, + { + "epoch": 0.07, + "grad_norm": 1.1934207677841187, + "learning_rate": 9.948838396005854e-06, + "loss": 0.7117, + "step": 580 + }, + { + "epoch": 0.07, + "grad_norm": 1.3645777702331543, + "learning_rate": 9.948541887674396e-06, + "loss": 0.6821, + "step": 581 + }, + { + "epoch": 0.07, + "grad_norm": 1.4982502460479736, + "learning_rate": 9.94824452705718e-06, + "loss": 0.654, + "step": 582 + }, + { + "epoch": 0.07, + "grad_norm": 4.82792329788208, + "learning_rate": 9.947946314205421e-06, + "loss": 0.6821, + "step": 583 + }, + { + "epoch": 0.07, + "grad_norm": 1.4962670803070068, + "learning_rate": 9.94764724917048e-06, + "loss": 0.6436, + "step": 584 + }, + { + "epoch": 0.07, + "grad_norm": 1.179075837135315, + "learning_rate": 9.947347332003865e-06, + "loss": 0.6445, + "step": 585 + }, + { + "epoch": 0.08, + "grad_norm": 1.9640148878097534, + "learning_rate": 9.94704656275723e-06, + "loss": 0.6306, + "step": 586 + }, + { + "epoch": 0.08, + "grad_norm": 1.0042561292648315, + "learning_rate": 9.946744941482379e-06, + "loss": 0.713, + "step": 587 + }, + { + "epoch": 0.08, + "grad_norm": 1.8437522649765015, + "learning_rate": 9.946442468231257e-06, + "loss": 0.7496, + "step": 588 + }, + { + "epoch": 0.08, + "grad_norm": 1.3848373889923096, + "learning_rate": 9.94613914305596e-06, + "loss": 0.6736, + "step": 589 + }, + { + "epoch": 0.08, + "grad_norm": 1.5635427236557007, + "learning_rate": 9.94583496600873e-06, + "loss": 0.6535, + "step": 590 + }, + { + "epoch": 0.08, + "grad_norm": 1.3483617305755615, + "learning_rate": 9.945529937141953e-06, + "loss": 0.616, + "step": 591 + }, + { + "epoch": 0.08, + "grad_norm": 1.2669429779052734, + "learning_rate": 9.945224056508168e-06, + "loss": 0.6904, + "step": 592 + }, + { + "epoch": 0.08, + "grad_norm": 1.7648189067840576, + "learning_rate": 9.944917324160055e-06, + "loss": 0.6421, + "step": 593 + }, + { + "epoch": 0.08, + "grad_norm": 1.4274234771728516, + "learning_rate": 9.944609740150441e-06, + "loss": 0.6098, + "step": 594 + }, + { + "epoch": 0.08, + "grad_norm": 1.2724729776382446, + "learning_rate": 9.944301304532302e-06, + "loss": 0.5873, + "step": 595 + }, + { + "epoch": 0.08, + "grad_norm": 1.306372880935669, + "learning_rate": 9.943992017358761e-06, + "loss": 0.6709, + "step": 596 + }, + { + "epoch": 0.08, + "grad_norm": 1.3919581174850464, + "learning_rate": 9.943681878683085e-06, + "loss": 0.6482, + "step": 597 + }, + { + "epoch": 0.08, + "grad_norm": 1.333129644393921, + "learning_rate": 9.94337088855869e-06, + "loss": 0.7064, + "step": 598 + }, + { + "epoch": 0.08, + "grad_norm": 1.256874680519104, + "learning_rate": 9.943059047039137e-06, + "loss": 0.6856, + "step": 599 + }, + { + "epoch": 0.08, + "grad_norm": 1.1219730377197266, + "learning_rate": 9.942746354178133e-06, + "loss": 0.6434, + "step": 600 + }, + { + "epoch": 0.08, + "grad_norm": 1.3746181726455688, + "learning_rate": 9.942432810029536e-06, + "loss": 0.6124, + "step": 601 + }, + { + "epoch": 0.08, + "grad_norm": 1.2980557680130005, + "learning_rate": 9.942118414647346e-06, + "loss": 0.6688, + "step": 602 + }, + { + "epoch": 0.08, + "grad_norm": 1.4423713684082031, + "learning_rate": 9.941803168085711e-06, + "loss": 0.6483, + "step": 603 + }, + { + "epoch": 0.08, + "grad_norm": 1.7107197046279907, + "learning_rate": 9.941487070398928e-06, + "loss": 0.6004, + "step": 604 + }, + { + "epoch": 0.08, + "grad_norm": 1.115032434463501, + "learning_rate": 9.941170121641434e-06, + "loss": 0.5836, + "step": 605 + }, + { + "epoch": 0.08, + "grad_norm": 1.354966402053833, + "learning_rate": 9.940852321867821e-06, + "loss": 0.6629, + "step": 606 + }, + { + "epoch": 0.08, + "grad_norm": 1.070405125617981, + "learning_rate": 9.940533671132821e-06, + "loss": 0.6276, + "step": 607 + }, + { + "epoch": 0.08, + "grad_norm": 1.133563756942749, + "learning_rate": 9.940214169491318e-06, + "loss": 0.715, + "step": 608 + }, + { + "epoch": 0.08, + "grad_norm": 1.1300113201141357, + "learning_rate": 9.939893816998337e-06, + "loss": 0.6603, + "step": 609 + }, + { + "epoch": 0.08, + "grad_norm": 1.367214322090149, + "learning_rate": 9.939572613709052e-06, + "loss": 0.6713, + "step": 610 + }, + { + "epoch": 0.08, + "grad_norm": 1.45294988155365, + "learning_rate": 9.939250559678787e-06, + "loss": 0.579, + "step": 611 + }, + { + "epoch": 0.08, + "grad_norm": 1.3254419565200806, + "learning_rate": 9.938927654963007e-06, + "loss": 0.579, + "step": 612 + }, + { + "epoch": 0.08, + "grad_norm": 1.1809929609298706, + "learning_rate": 9.938603899617323e-06, + "loss": 0.6424, + "step": 613 + }, + { + "epoch": 0.08, + "grad_norm": 1.836148977279663, + "learning_rate": 9.9382792936975e-06, + "loss": 0.598, + "step": 614 + }, + { + "epoch": 0.08, + "grad_norm": 1.1036103963851929, + "learning_rate": 9.937953837259444e-06, + "loss": 0.5946, + "step": 615 + }, + { + "epoch": 0.08, + "grad_norm": 1.2749247550964355, + "learning_rate": 9.937627530359206e-06, + "loss": 0.6951, + "step": 616 + }, + { + "epoch": 0.08, + "grad_norm": 1.2078628540039062, + "learning_rate": 9.937300373052987e-06, + "loss": 0.6501, + "step": 617 + }, + { + "epoch": 0.08, + "grad_norm": 1.239443302154541, + "learning_rate": 9.936972365397133e-06, + "loss": 0.6449, + "step": 618 + }, + { + "epoch": 0.08, + "grad_norm": 1.7138959169387817, + "learning_rate": 9.936643507448138e-06, + "loss": 0.714, + "step": 619 + }, + { + "epoch": 0.08, + "grad_norm": 1.0761655569076538, + "learning_rate": 9.93631379926264e-06, + "loss": 0.6237, + "step": 620 + }, + { + "epoch": 0.08, + "grad_norm": 1.2124683856964111, + "learning_rate": 9.935983240897422e-06, + "loss": 0.6217, + "step": 621 + }, + { + "epoch": 0.08, + "grad_norm": 1.4442805051803589, + "learning_rate": 9.93565183240942e-06, + "loss": 0.6459, + "step": 622 + }, + { + "epoch": 0.08, + "grad_norm": 1.2149344682693481, + "learning_rate": 9.935319573855711e-06, + "loss": 0.6465, + "step": 623 + }, + { + "epoch": 0.08, + "grad_norm": 1.1919761896133423, + "learning_rate": 9.934986465293518e-06, + "loss": 0.5959, + "step": 624 + }, + { + "epoch": 0.08, + "grad_norm": 1.3023431301116943, + "learning_rate": 9.934652506780214e-06, + "loss": 0.6582, + "step": 625 + }, + { + "epoch": 0.08, + "grad_norm": 1.142003059387207, + "learning_rate": 9.934317698373317e-06, + "loss": 0.6515, + "step": 626 + }, + { + "epoch": 0.08, + "grad_norm": 1.3569769859313965, + "learning_rate": 9.93398204013049e-06, + "loss": 0.7018, + "step": 627 + }, + { + "epoch": 0.08, + "grad_norm": 1.1718806028366089, + "learning_rate": 9.933645532109544e-06, + "loss": 0.5533, + "step": 628 + }, + { + "epoch": 0.08, + "grad_norm": 1.1356302499771118, + "learning_rate": 9.933308174368434e-06, + "loss": 0.6077, + "step": 629 + }, + { + "epoch": 0.08, + "grad_norm": 1.1580795049667358, + "learning_rate": 9.932969966965267e-06, + "loss": 0.6484, + "step": 630 + }, + { + "epoch": 0.08, + "grad_norm": 1.378556728363037, + "learning_rate": 9.932630909958287e-06, + "loss": 0.6518, + "step": 631 + }, + { + "epoch": 0.08, + "grad_norm": 1.0749226808547974, + "learning_rate": 9.932291003405893e-06, + "loss": 0.6614, + "step": 632 + }, + { + "epoch": 0.08, + "grad_norm": 1.4288339614868164, + "learning_rate": 9.931950247366625e-06, + "loss": 0.7169, + "step": 633 + }, + { + "epoch": 0.08, + "grad_norm": 2.120861291885376, + "learning_rate": 9.931608641899172e-06, + "loss": 0.6148, + "step": 634 + }, + { + "epoch": 0.08, + "grad_norm": 1.725447654724121, + "learning_rate": 9.931266187062372e-06, + "loss": 0.5805, + "step": 635 + }, + { + "epoch": 0.08, + "grad_norm": 1.3710678815841675, + "learning_rate": 9.930922882915201e-06, + "loss": 0.6566, + "step": 636 + }, + { + "epoch": 0.08, + "grad_norm": 1.3103550672531128, + "learning_rate": 9.93057872951679e-06, + "loss": 0.6861, + "step": 637 + }, + { + "epoch": 0.08, + "grad_norm": 1.3011488914489746, + "learning_rate": 9.93023372692641e-06, + "loss": 0.6222, + "step": 638 + }, + { + "epoch": 0.08, + "grad_norm": 1.2335342168807983, + "learning_rate": 9.92988787520348e-06, + "loss": 0.7169, + "step": 639 + }, + { + "epoch": 0.08, + "grad_norm": 1.844463586807251, + "learning_rate": 9.929541174407568e-06, + "loss": 0.6041, + "step": 640 + }, + { + "epoch": 0.08, + "grad_norm": 1.6094704866409302, + "learning_rate": 9.929193624598386e-06, + "loss": 0.6485, + "step": 641 + }, + { + "epoch": 0.08, + "grad_norm": 1.2318650484085083, + "learning_rate": 9.928845225835791e-06, + "loss": 0.7301, + "step": 642 + }, + { + "epoch": 0.08, + "grad_norm": 1.3937469720840454, + "learning_rate": 9.928495978179788e-06, + "loss": 0.658, + "step": 643 + }, + { + "epoch": 0.08, + "grad_norm": 1.3361302614212036, + "learning_rate": 9.928145881690527e-06, + "loss": 0.6271, + "step": 644 + }, + { + "epoch": 0.08, + "grad_norm": 1.590607762336731, + "learning_rate": 9.927794936428308e-06, + "loss": 0.6345, + "step": 645 + }, + { + "epoch": 0.08, + "grad_norm": 1.0950250625610352, + "learning_rate": 9.927443142453573e-06, + "loss": 0.5692, + "step": 646 + }, + { + "epoch": 0.08, + "grad_norm": 3.009345531463623, + "learning_rate": 9.92709049982691e-06, + "loss": 0.6775, + "step": 647 + }, + { + "epoch": 0.08, + "grad_norm": 1.5916540622711182, + "learning_rate": 9.926737008609055e-06, + "loss": 0.6645, + "step": 648 + }, + { + "epoch": 0.08, + "grad_norm": 1.0630630254745483, + "learning_rate": 9.92638266886089e-06, + "loss": 0.596, + "step": 649 + }, + { + "epoch": 0.08, + "grad_norm": 1.2478364706039429, + "learning_rate": 9.926027480643442e-06, + "loss": 0.6118, + "step": 650 + }, + { + "epoch": 0.08, + "grad_norm": 1.2530955076217651, + "learning_rate": 9.925671444017887e-06, + "loss": 0.658, + "step": 651 + }, + { + "epoch": 0.08, + "grad_norm": 1.3152321577072144, + "learning_rate": 9.925314559045543e-06, + "loss": 0.6898, + "step": 652 + }, + { + "epoch": 0.08, + "grad_norm": 1.2925292253494263, + "learning_rate": 9.924956825787877e-06, + "loss": 0.6682, + "step": 653 + }, + { + "epoch": 0.08, + "grad_norm": 1.472712755203247, + "learning_rate": 9.924598244306502e-06, + "loss": 0.6382, + "step": 654 + }, + { + "epoch": 0.08, + "grad_norm": 1.6111242771148682, + "learning_rate": 9.924238814663174e-06, + "loss": 0.5262, + "step": 655 + }, + { + "epoch": 0.08, + "grad_norm": 1.210485816001892, + "learning_rate": 9.9238785369198e-06, + "loss": 0.6865, + "step": 656 + }, + { + "epoch": 0.08, + "grad_norm": 1.3140826225280762, + "learning_rate": 9.92351741113843e-06, + "loss": 0.6448, + "step": 657 + }, + { + "epoch": 0.08, + "grad_norm": 1.125855803489685, + "learning_rate": 9.92315543738126e-06, + "loss": 0.6277, + "step": 658 + }, + { + "epoch": 0.08, + "grad_norm": 0.9021656513214111, + "learning_rate": 9.922792615710632e-06, + "loss": 0.6673, + "step": 659 + }, + { + "epoch": 0.08, + "grad_norm": 1.304652452468872, + "learning_rate": 9.922428946189038e-06, + "loss": 0.6477, + "step": 660 + }, + { + "epoch": 0.08, + "grad_norm": 1.2016267776489258, + "learning_rate": 9.922064428879108e-06, + "loss": 0.7264, + "step": 661 + }, + { + "epoch": 0.08, + "grad_norm": 1.3767685890197754, + "learning_rate": 9.921699063843624e-06, + "loss": 0.6343, + "step": 662 + }, + { + "epoch": 0.08, + "grad_norm": 1.5952433347702026, + "learning_rate": 9.921332851145516e-06, + "loss": 0.6332, + "step": 663 + }, + { + "epoch": 0.09, + "grad_norm": 1.1251158714294434, + "learning_rate": 9.920965790847852e-06, + "loss": 0.6287, + "step": 664 + }, + { + "epoch": 0.09, + "grad_norm": 1.7617466449737549, + "learning_rate": 9.920597883013854e-06, + "loss": 0.6315, + "step": 665 + }, + { + "epoch": 0.09, + "grad_norm": 1.5821009874343872, + "learning_rate": 9.920229127706884e-06, + "loss": 0.5471, + "step": 666 + }, + { + "epoch": 0.09, + "grad_norm": 1.1487936973571777, + "learning_rate": 9.919859524990456e-06, + "loss": 0.7097, + "step": 667 + }, + { + "epoch": 0.09, + "grad_norm": 1.3274495601654053, + "learning_rate": 9.919489074928223e-06, + "loss": 0.5928, + "step": 668 + }, + { + "epoch": 0.09, + "grad_norm": 1.1028430461883545, + "learning_rate": 9.91911777758399e-06, + "loss": 0.6415, + "step": 669 + }, + { + "epoch": 0.09, + "grad_norm": 1.2603744268417358, + "learning_rate": 9.918745633021706e-06, + "loss": 0.7122, + "step": 670 + }, + { + "epoch": 0.09, + "grad_norm": 1.1494251489639282, + "learning_rate": 9.918372641305463e-06, + "loss": 0.6002, + "step": 671 + }, + { + "epoch": 0.09, + "grad_norm": 1.1268144845962524, + "learning_rate": 9.917998802499502e-06, + "loss": 0.677, + "step": 672 + }, + { + "epoch": 0.09, + "grad_norm": 1.5660556554794312, + "learning_rate": 9.91762411666821e-06, + "loss": 0.6431, + "step": 673 + }, + { + "epoch": 0.09, + "grad_norm": 1.1017109155654907, + "learning_rate": 9.917248583876116e-06, + "loss": 0.7173, + "step": 674 + }, + { + "epoch": 0.09, + "grad_norm": 1.498296856880188, + "learning_rate": 9.916872204187902e-06, + "loss": 0.6641, + "step": 675 + }, + { + "epoch": 0.09, + "grad_norm": 1.0133637189865112, + "learning_rate": 9.91649497766839e-06, + "loss": 0.6279, + "step": 676 + }, + { + "epoch": 0.09, + "grad_norm": 1.355507731437683, + "learning_rate": 9.91611690438255e-06, + "loss": 0.5455, + "step": 677 + }, + { + "epoch": 0.09, + "grad_norm": 1.849907636642456, + "learning_rate": 9.915737984395498e-06, + "loss": 0.7237, + "step": 678 + }, + { + "epoch": 0.09, + "grad_norm": 1.1791698932647705, + "learning_rate": 9.915358217772491e-06, + "loss": 0.6523, + "step": 679 + }, + { + "epoch": 0.09, + "grad_norm": 1.3907570838928223, + "learning_rate": 9.91497760457894e-06, + "loss": 0.6599, + "step": 680 + }, + { + "epoch": 0.09, + "grad_norm": 1.5050311088562012, + "learning_rate": 9.914596144880399e-06, + "loss": 0.6543, + "step": 681 + }, + { + "epoch": 0.09, + "grad_norm": 1.2648143768310547, + "learning_rate": 9.914213838742566e-06, + "loss": 0.5308, + "step": 682 + }, + { + "epoch": 0.09, + "grad_norm": 1.1902687549591064, + "learning_rate": 9.913830686231281e-06, + "loss": 0.6256, + "step": 683 + }, + { + "epoch": 0.09, + "grad_norm": 1.0023192167282104, + "learning_rate": 9.913446687412541e-06, + "loss": 0.6192, + "step": 684 + }, + { + "epoch": 0.09, + "grad_norm": 1.0021028518676758, + "learning_rate": 9.913061842352475e-06, + "loss": 0.6225, + "step": 685 + }, + { + "epoch": 0.09, + "grad_norm": 2.0987367630004883, + "learning_rate": 9.912676151117371e-06, + "loss": 0.6478, + "step": 686 + }, + { + "epoch": 0.09, + "grad_norm": 1.1188229322433472, + "learning_rate": 9.912289613773653e-06, + "loss": 0.5543, + "step": 687 + }, + { + "epoch": 0.09, + "grad_norm": 1.3304802179336548, + "learning_rate": 9.911902230387896e-06, + "loss": 0.6396, + "step": 688 + }, + { + "epoch": 0.09, + "grad_norm": 1.0771523714065552, + "learning_rate": 9.911514001026816e-06, + "loss": 0.6503, + "step": 689 + }, + { + "epoch": 0.09, + "grad_norm": 1.0926355123519897, + "learning_rate": 9.911124925757282e-06, + "loss": 0.6256, + "step": 690 + }, + { + "epoch": 0.09, + "grad_norm": 1.1526925563812256, + "learning_rate": 9.9107350046463e-06, + "loss": 0.6386, + "step": 691 + }, + { + "epoch": 0.09, + "grad_norm": 1.0600298643112183, + "learning_rate": 9.91034423776103e-06, + "loss": 0.6494, + "step": 692 + }, + { + "epoch": 0.09, + "grad_norm": 1.321643352508545, + "learning_rate": 9.90995262516877e-06, + "loss": 0.5935, + "step": 693 + }, + { + "epoch": 0.09, + "grad_norm": 1.1695914268493652, + "learning_rate": 9.909560166936968e-06, + "loss": 0.7052, + "step": 694 + }, + { + "epoch": 0.09, + "grad_norm": 1.3552606105804443, + "learning_rate": 9.909166863133218e-06, + "loss": 0.6771, + "step": 695 + }, + { + "epoch": 0.09, + "grad_norm": 1.070600152015686, + "learning_rate": 9.908772713825259e-06, + "loss": 0.6489, + "step": 696 + }, + { + "epoch": 0.09, + "grad_norm": 1.541309118270874, + "learning_rate": 9.908377719080976e-06, + "loss": 0.7063, + "step": 697 + }, + { + "epoch": 0.09, + "grad_norm": 1.2539750337600708, + "learning_rate": 9.907981878968395e-06, + "loss": 0.7322, + "step": 698 + }, + { + "epoch": 0.09, + "grad_norm": 1.3420991897583008, + "learning_rate": 9.907585193555696e-06, + "loss": 0.5876, + "step": 699 + }, + { + "epoch": 0.09, + "grad_norm": 5.159241676330566, + "learning_rate": 9.907187662911195e-06, + "loss": 0.6435, + "step": 700 + }, + { + "epoch": 0.09, + "grad_norm": 1.334041953086853, + "learning_rate": 9.906789287103364e-06, + "loss": 0.67, + "step": 701 + }, + { + "epoch": 0.09, + "grad_norm": 1.2002819776535034, + "learning_rate": 9.90639006620081e-06, + "loss": 0.6775, + "step": 702 + }, + { + "epoch": 0.09, + "grad_norm": 1.6984732151031494, + "learning_rate": 9.905990000272295e-06, + "loss": 0.6121, + "step": 703 + }, + { + "epoch": 0.09, + "grad_norm": 1.382564663887024, + "learning_rate": 9.90558908938672e-06, + "loss": 0.6936, + "step": 704 + }, + { + "epoch": 0.09, + "grad_norm": 1.2500004768371582, + "learning_rate": 9.905187333613134e-06, + "loss": 0.6399, + "step": 705 + }, + { + "epoch": 0.09, + "grad_norm": 3.314711332321167, + "learning_rate": 9.904784733020732e-06, + "loss": 0.639, + "step": 706 + }, + { + "epoch": 0.09, + "grad_norm": 1.5994131565093994, + "learning_rate": 9.904381287678853e-06, + "loss": 0.5754, + "step": 707 + }, + { + "epoch": 0.09, + "grad_norm": 1.1765432357788086, + "learning_rate": 9.903976997656982e-06, + "loss": 0.5971, + "step": 708 + }, + { + "epoch": 0.09, + "grad_norm": 1.9509575366973877, + "learning_rate": 9.903571863024752e-06, + "loss": 0.6634, + "step": 709 + }, + { + "epoch": 0.09, + "grad_norm": 1.1447938680648804, + "learning_rate": 9.903165883851936e-06, + "loss": 0.6595, + "step": 710 + }, + { + "epoch": 0.09, + "grad_norm": 1.166501760482788, + "learning_rate": 9.902759060208456e-06, + "loss": 0.6014, + "step": 711 + }, + { + "epoch": 0.09, + "grad_norm": 1.1991925239562988, + "learning_rate": 9.90235139216438e-06, + "loss": 0.6136, + "step": 712 + }, + { + "epoch": 0.09, + "grad_norm": 1.2937411069869995, + "learning_rate": 9.901942879789923e-06, + "loss": 0.5599, + "step": 713 + }, + { + "epoch": 0.09, + "grad_norm": 1.0895254611968994, + "learning_rate": 9.90153352315544e-06, + "loss": 0.6231, + "step": 714 + }, + { + "epoch": 0.09, + "grad_norm": 1.2510082721710205, + "learning_rate": 9.901123322331434e-06, + "loss": 0.6087, + "step": 715 + }, + { + "epoch": 0.09, + "grad_norm": 0.9192419648170471, + "learning_rate": 9.900712277388556e-06, + "loss": 0.5165, + "step": 716 + }, + { + "epoch": 0.09, + "grad_norm": 1.6960457563400269, + "learning_rate": 9.900300388397597e-06, + "loss": 0.7067, + "step": 717 + }, + { + "epoch": 0.09, + "grad_norm": 1.0721408128738403, + "learning_rate": 9.899887655429501e-06, + "loss": 0.5991, + "step": 718 + }, + { + "epoch": 0.09, + "grad_norm": 1.2546442747116089, + "learning_rate": 9.89947407855535e-06, + "loss": 0.5862, + "step": 719 + }, + { + "epoch": 0.09, + "grad_norm": 1.3291999101638794, + "learning_rate": 9.899059657846373e-06, + "loss": 0.6478, + "step": 720 + }, + { + "epoch": 0.09, + "grad_norm": 1.157029628753662, + "learning_rate": 9.898644393373946e-06, + "loss": 0.6432, + "step": 721 + }, + { + "epoch": 0.09, + "grad_norm": 1.4094780683517456, + "learning_rate": 9.898228285209593e-06, + "loss": 0.582, + "step": 722 + }, + { + "epoch": 0.09, + "grad_norm": 1.2219111919403076, + "learning_rate": 9.897811333424977e-06, + "loss": 0.5938, + "step": 723 + }, + { + "epoch": 0.09, + "grad_norm": 1.2780520915985107, + "learning_rate": 9.897393538091912e-06, + "loss": 0.5572, + "step": 724 + }, + { + "epoch": 0.09, + "grad_norm": 2.0696425437927246, + "learning_rate": 9.896974899282352e-06, + "loss": 0.6494, + "step": 725 + }, + { + "epoch": 0.09, + "grad_norm": 1.4121901988983154, + "learning_rate": 9.896555417068401e-06, + "loss": 0.6419, + "step": 726 + }, + { + "epoch": 0.09, + "grad_norm": 1.3450160026550293, + "learning_rate": 9.896135091522305e-06, + "loss": 0.7238, + "step": 727 + }, + { + "epoch": 0.09, + "grad_norm": 1.0523320436477661, + "learning_rate": 9.895713922716457e-06, + "loss": 0.5947, + "step": 728 + }, + { + "epoch": 0.09, + "grad_norm": 1.5163980722427368, + "learning_rate": 9.895291910723396e-06, + "loss": 0.5785, + "step": 729 + }, + { + "epoch": 0.09, + "grad_norm": 1.4099388122558594, + "learning_rate": 9.894869055615803e-06, + "loss": 0.5904, + "step": 730 + }, + { + "epoch": 0.09, + "grad_norm": 0.9844495058059692, + "learning_rate": 9.894445357466507e-06, + "loss": 0.6382, + "step": 731 + }, + { + "epoch": 0.09, + "grad_norm": 1.2758938074111938, + "learning_rate": 9.894020816348483e-06, + "loss": 0.6405, + "step": 732 + }, + { + "epoch": 0.09, + "grad_norm": 1.246328353881836, + "learning_rate": 9.893595432334846e-06, + "loss": 0.6006, + "step": 733 + }, + { + "epoch": 0.09, + "grad_norm": 1.2072193622589111, + "learning_rate": 9.893169205498864e-06, + "loss": 0.6452, + "step": 734 + }, + { + "epoch": 0.09, + "grad_norm": 1.2706694602966309, + "learning_rate": 9.892742135913942e-06, + "loss": 0.619, + "step": 735 + }, + { + "epoch": 0.09, + "grad_norm": 1.2876821756362915, + "learning_rate": 9.892314223653636e-06, + "loss": 0.6796, + "step": 736 + }, + { + "epoch": 0.09, + "grad_norm": 1.3346693515777588, + "learning_rate": 9.891885468791645e-06, + "loss": 0.6494, + "step": 737 + }, + { + "epoch": 0.09, + "grad_norm": 1.2320876121520996, + "learning_rate": 9.891455871401814e-06, + "loss": 0.6095, + "step": 738 + }, + { + "epoch": 0.09, + "grad_norm": 1.2823511362075806, + "learning_rate": 9.89102543155813e-06, + "loss": 0.7429, + "step": 739 + }, + { + "epoch": 0.09, + "grad_norm": 1.3542308807373047, + "learning_rate": 9.890594149334731e-06, + "loss": 0.6251, + "step": 740 + }, + { + "epoch": 0.09, + "grad_norm": 2.2784230709075928, + "learning_rate": 9.890162024805895e-06, + "loss": 0.693, + "step": 741 + }, + { + "epoch": 0.1, + "grad_norm": 1.167242169380188, + "learning_rate": 9.889729058046045e-06, + "loss": 0.6828, + "step": 742 + }, + { + "epoch": 0.1, + "grad_norm": 1.067370891571045, + "learning_rate": 9.889295249129754e-06, + "loss": 0.5998, + "step": 743 + }, + { + "epoch": 0.1, + "grad_norm": 1.2341341972351074, + "learning_rate": 9.888860598131733e-06, + "loss": 0.5873, + "step": 744 + }, + { + "epoch": 0.1, + "grad_norm": 1.1772515773773193, + "learning_rate": 9.888425105126845e-06, + "loss": 0.6183, + "step": 745 + }, + { + "epoch": 0.1, + "grad_norm": 0.9825636744499207, + "learning_rate": 9.887988770190093e-06, + "loss": 0.6205, + "step": 746 + }, + { + "epoch": 0.1, + "grad_norm": 1.7665997743606567, + "learning_rate": 9.887551593396628e-06, + "loss": 0.6367, + "step": 747 + }, + { + "epoch": 0.1, + "grad_norm": 1.423568606376648, + "learning_rate": 9.887113574821743e-06, + "loss": 0.635, + "step": 748 + }, + { + "epoch": 0.1, + "grad_norm": 1.1755050420761108, + "learning_rate": 9.88667471454088e-06, + "loss": 0.5415, + "step": 749 + }, + { + "epoch": 0.1, + "grad_norm": 1.1046626567840576, + "learning_rate": 9.886235012629623e-06, + "loss": 0.6031, + "step": 750 + }, + { + "epoch": 0.1, + "grad_norm": 1.0809575319290161, + "learning_rate": 9.885794469163702e-06, + "loss": 0.6192, + "step": 751 + }, + { + "epoch": 0.1, + "grad_norm": 1.5112090110778809, + "learning_rate": 9.88535308421899e-06, + "loss": 0.6352, + "step": 752 + }, + { + "epoch": 0.1, + "grad_norm": 1.2142413854599, + "learning_rate": 9.884910857871508e-06, + "loss": 0.6814, + "step": 753 + }, + { + "epoch": 0.1, + "grad_norm": 1.40277099609375, + "learning_rate": 9.88446779019742e-06, + "loss": 0.6682, + "step": 754 + }, + { + "epoch": 0.1, + "grad_norm": 1.2482854127883911, + "learning_rate": 9.884023881273038e-06, + "loss": 0.7463, + "step": 755 + }, + { + "epoch": 0.1, + "grad_norm": 1.209838628768921, + "learning_rate": 9.883579131174813e-06, + "loss": 0.6347, + "step": 756 + }, + { + "epoch": 0.1, + "grad_norm": 1.2466986179351807, + "learning_rate": 9.883133539979342e-06, + "loss": 0.5901, + "step": 757 + }, + { + "epoch": 0.1, + "grad_norm": 1.2496932744979858, + "learning_rate": 9.882687107763374e-06, + "loss": 0.6438, + "step": 758 + }, + { + "epoch": 0.1, + "grad_norm": 1.1028106212615967, + "learning_rate": 9.882239834603798e-06, + "loss": 0.6648, + "step": 759 + }, + { + "epoch": 0.1, + "grad_norm": 1.3057689666748047, + "learning_rate": 9.881791720577643e-06, + "loss": 0.634, + "step": 760 + }, + { + "epoch": 0.1, + "grad_norm": 1.1680352687835693, + "learning_rate": 9.88134276576209e-06, + "loss": 0.6429, + "step": 761 + }, + { + "epoch": 0.1, + "grad_norm": 1.238755702972412, + "learning_rate": 9.880892970234464e-06, + "loss": 0.7034, + "step": 762 + }, + { + "epoch": 0.1, + "grad_norm": 1.2210910320281982, + "learning_rate": 9.880442334072232e-06, + "loss": 0.6392, + "step": 763 + }, + { + "epoch": 0.1, + "grad_norm": 1.2049086093902588, + "learning_rate": 9.879990857353007e-06, + "loss": 0.5435, + "step": 764 + }, + { + "epoch": 0.1, + "grad_norm": 1.2110891342163086, + "learning_rate": 9.879538540154545e-06, + "loss": 0.6339, + "step": 765 + }, + { + "epoch": 0.1, + "grad_norm": 1.2696014642715454, + "learning_rate": 9.87908538255475e-06, + "loss": 0.5821, + "step": 766 + }, + { + "epoch": 0.1, + "grad_norm": 1.348198652267456, + "learning_rate": 9.87863138463167e-06, + "loss": 0.7056, + "step": 767 + }, + { + "epoch": 0.1, + "grad_norm": 1.1112827062606812, + "learning_rate": 9.878176546463495e-06, + "loss": 0.5591, + "step": 768 + }, + { + "epoch": 0.1, + "grad_norm": 1.0482698678970337, + "learning_rate": 9.877720868128562e-06, + "loss": 0.6431, + "step": 769 + }, + { + "epoch": 0.1, + "grad_norm": 1.2081336975097656, + "learning_rate": 9.877264349705353e-06, + "loss": 0.6349, + "step": 770 + }, + { + "epoch": 0.1, + "grad_norm": 1.1520190238952637, + "learning_rate": 9.876806991272497e-06, + "loss": 0.6484, + "step": 771 + }, + { + "epoch": 0.1, + "grad_norm": 1.0996439456939697, + "learning_rate": 9.876348792908757e-06, + "loss": 0.7195, + "step": 772 + }, + { + "epoch": 0.1, + "grad_norm": 1.3486486673355103, + "learning_rate": 9.875889754693056e-06, + "loss": 0.675, + "step": 773 + }, + { + "epoch": 0.1, + "grad_norm": 1.2174115180969238, + "learning_rate": 9.875429876704448e-06, + "loss": 0.6746, + "step": 774 + }, + { + "epoch": 0.1, + "grad_norm": 1.3793160915374756, + "learning_rate": 9.874969159022143e-06, + "loss": 0.6585, + "step": 775 + }, + { + "epoch": 0.1, + "grad_norm": 1.225388526916504, + "learning_rate": 9.874507601725486e-06, + "loss": 0.6186, + "step": 776 + }, + { + "epoch": 0.1, + "grad_norm": 1.5237325429916382, + "learning_rate": 9.874045204893975e-06, + "loss": 0.6844, + "step": 777 + }, + { + "epoch": 0.1, + "grad_norm": 1.6531785726547241, + "learning_rate": 9.873581968607243e-06, + "loss": 0.617, + "step": 778 + }, + { + "epoch": 0.1, + "grad_norm": 1.1168662309646606, + "learning_rate": 9.87311789294508e-06, + "loss": 0.6097, + "step": 779 + }, + { + "epoch": 0.1, + "grad_norm": 1.150830864906311, + "learning_rate": 9.872652977987409e-06, + "loss": 0.5975, + "step": 780 + }, + { + "epoch": 0.1, + "grad_norm": 1.5129787921905518, + "learning_rate": 9.8721872238143e-06, + "loss": 0.594, + "step": 781 + }, + { + "epoch": 0.1, + "grad_norm": 1.2734906673431396, + "learning_rate": 9.871720630505975e-06, + "loss": 0.5761, + "step": 782 + }, + { + "epoch": 0.1, + "grad_norm": 1.1508820056915283, + "learning_rate": 9.871253198142792e-06, + "loss": 0.4496, + "step": 783 + }, + { + "epoch": 0.1, + "grad_norm": 1.085600733757019, + "learning_rate": 9.870784926805258e-06, + "loss": 0.5349, + "step": 784 + }, + { + "epoch": 0.1, + "grad_norm": 1.2703758478164673, + "learning_rate": 9.870315816574022e-06, + "loss": 0.501, + "step": 785 + }, + { + "epoch": 0.1, + "grad_norm": 1.0857547521591187, + "learning_rate": 9.86984586752988e-06, + "loss": 0.6313, + "step": 786 + }, + { + "epoch": 0.1, + "grad_norm": 1.0965255498886108, + "learning_rate": 9.869375079753772e-06, + "loss": 0.6636, + "step": 787 + }, + { + "epoch": 0.1, + "grad_norm": 1.169797658920288, + "learning_rate": 9.868903453326776e-06, + "loss": 0.7224, + "step": 788 + }, + { + "epoch": 0.1, + "grad_norm": 1.5188205242156982, + "learning_rate": 9.868430988330129e-06, + "loss": 0.6751, + "step": 789 + }, + { + "epoch": 0.1, + "grad_norm": 1.0894997119903564, + "learning_rate": 9.867957684845196e-06, + "loss": 0.6496, + "step": 790 + }, + { + "epoch": 0.1, + "grad_norm": 1.124399185180664, + "learning_rate": 9.867483542953498e-06, + "loss": 0.7526, + "step": 791 + }, + { + "epoch": 0.1, + "grad_norm": 1.100160837173462, + "learning_rate": 9.867008562736695e-06, + "loss": 0.6073, + "step": 792 + }, + { + "epoch": 0.1, + "grad_norm": 1.3616927862167358, + "learning_rate": 9.86653274427659e-06, + "loss": 0.6064, + "step": 793 + }, + { + "epoch": 0.1, + "grad_norm": 1.117180585861206, + "learning_rate": 9.86605608765514e-06, + "loss": 0.5775, + "step": 794 + }, + { + "epoch": 0.1, + "grad_norm": 1.1300320625305176, + "learning_rate": 9.865578592954433e-06, + "loss": 0.5791, + "step": 795 + }, + { + "epoch": 0.1, + "grad_norm": 1.2053767442703247, + "learning_rate": 9.86510026025671e-06, + "loss": 0.6415, + "step": 796 + }, + { + "epoch": 0.1, + "grad_norm": 1.2116667032241821, + "learning_rate": 9.864621089644356e-06, + "loss": 0.6081, + "step": 797 + }, + { + "epoch": 0.1, + "grad_norm": 1.282387614250183, + "learning_rate": 9.864141081199893e-06, + "loss": 0.6175, + "step": 798 + }, + { + "epoch": 0.1, + "grad_norm": 1.0022624731063843, + "learning_rate": 9.863660235005999e-06, + "loss": 0.6809, + "step": 799 + }, + { + "epoch": 0.1, + "grad_norm": 1.160461664199829, + "learning_rate": 9.863178551145484e-06, + "loss": 0.6991, + "step": 800 + }, + { + "epoch": 0.1, + "grad_norm": 1.3416701555252075, + "learning_rate": 9.862696029701316e-06, + "loss": 0.5662, + "step": 801 + }, + { + "epoch": 0.1, + "grad_norm": 1.1150017976760864, + "learning_rate": 9.86221267075659e-06, + "loss": 0.5754, + "step": 802 + }, + { + "epoch": 0.1, + "grad_norm": 1.3248844146728516, + "learning_rate": 9.861728474394564e-06, + "loss": 0.6595, + "step": 803 + }, + { + "epoch": 0.1, + "grad_norm": 1.3990356922149658, + "learning_rate": 9.861243440698622e-06, + "loss": 0.643, + "step": 804 + }, + { + "epoch": 0.1, + "grad_norm": 1.2016390562057495, + "learning_rate": 9.860757569752309e-06, + "loss": 0.6551, + "step": 805 + }, + { + "epoch": 0.1, + "grad_norm": 1.7251582145690918, + "learning_rate": 9.860270861639303e-06, + "loss": 0.6475, + "step": 806 + }, + { + "epoch": 0.1, + "grad_norm": 1.329265832901001, + "learning_rate": 9.859783316443429e-06, + "loss": 0.6333, + "step": 807 + }, + { + "epoch": 0.1, + "grad_norm": 1.7398295402526855, + "learning_rate": 9.859294934248658e-06, + "loss": 0.6681, + "step": 808 + }, + { + "epoch": 0.1, + "grad_norm": 1.1610875129699707, + "learning_rate": 9.858805715139102e-06, + "loss": 0.643, + "step": 809 + }, + { + "epoch": 0.1, + "grad_norm": 1.1122682094573975, + "learning_rate": 9.858315659199023e-06, + "loss": 0.599, + "step": 810 + }, + { + "epoch": 0.1, + "grad_norm": 1.0971416234970093, + "learning_rate": 9.857824766512818e-06, + "loss": 0.5669, + "step": 811 + }, + { + "epoch": 0.1, + "grad_norm": 1.1175240278244019, + "learning_rate": 9.857333037165038e-06, + "loss": 0.6802, + "step": 812 + }, + { + "epoch": 0.1, + "grad_norm": 1.5853863954544067, + "learning_rate": 9.85684047124037e-06, + "loss": 0.6693, + "step": 813 + }, + { + "epoch": 0.1, + "grad_norm": 1.930460810661316, + "learning_rate": 9.85634706882365e-06, + "loss": 0.5844, + "step": 814 + }, + { + "epoch": 0.1, + "grad_norm": 1.2769625186920166, + "learning_rate": 9.855852829999857e-06, + "loss": 0.672, + "step": 815 + }, + { + "epoch": 0.1, + "grad_norm": 1.0591260194778442, + "learning_rate": 9.855357754854112e-06, + "loss": 0.617, + "step": 816 + }, + { + "epoch": 0.1, + "grad_norm": 1.2074815034866333, + "learning_rate": 9.854861843471683e-06, + "loss": 0.6256, + "step": 817 + }, + { + "epoch": 0.1, + "grad_norm": 1.4503984451293945, + "learning_rate": 9.854365095937982e-06, + "loss": 0.6393, + "step": 818 + }, + { + "epoch": 0.1, + "grad_norm": 1.3213568925857544, + "learning_rate": 9.85386751233856e-06, + "loss": 0.74, + "step": 819 + }, + { + "epoch": 0.11, + "grad_norm": 1.231770396232605, + "learning_rate": 9.853369092759118e-06, + "loss": 0.6627, + "step": 820 + }, + { + "epoch": 0.11, + "grad_norm": 1.1289067268371582, + "learning_rate": 9.852869837285497e-06, + "loss": 0.73, + "step": 821 + }, + { + "epoch": 0.11, + "grad_norm": 1.8980963230133057, + "learning_rate": 9.852369746003685e-06, + "loss": 0.6039, + "step": 822 + }, + { + "epoch": 0.11, + "grad_norm": 3.139829397201538, + "learning_rate": 9.851868818999813e-06, + "loss": 0.5732, + "step": 823 + }, + { + "epoch": 0.11, + "grad_norm": 1.1257266998291016, + "learning_rate": 9.851367056360154e-06, + "loss": 0.6825, + "step": 824 + }, + { + "epoch": 0.11, + "grad_norm": 1.271867275238037, + "learning_rate": 9.85086445817113e-06, + "loss": 0.6534, + "step": 825 + }, + { + "epoch": 0.11, + "grad_norm": 1.1219916343688965, + "learning_rate": 9.850361024519298e-06, + "loss": 0.7195, + "step": 826 + }, + { + "epoch": 0.11, + "grad_norm": 1.4811643362045288, + "learning_rate": 9.849856755491367e-06, + "loss": 0.6099, + "step": 827 + }, + { + "epoch": 0.11, + "grad_norm": 1.355452299118042, + "learning_rate": 9.849351651174188e-06, + "loss": 0.6789, + "step": 828 + }, + { + "epoch": 0.11, + "grad_norm": 1.5336081981658936, + "learning_rate": 9.848845711654754e-06, + "loss": 0.6725, + "step": 829 + }, + { + "epoch": 0.11, + "grad_norm": 1.1408854722976685, + "learning_rate": 9.8483389370202e-06, + "loss": 0.6471, + "step": 830 + }, + { + "epoch": 0.11, + "grad_norm": 1.3404558897018433, + "learning_rate": 9.847831327357813e-06, + "loss": 0.7116, + "step": 831 + }, + { + "epoch": 0.11, + "grad_norm": 1.0383011102676392, + "learning_rate": 9.847322882755015e-06, + "loss": 0.7415, + "step": 832 + }, + { + "epoch": 0.11, + "grad_norm": 1.4247316122055054, + "learning_rate": 9.846813603299377e-06, + "loss": 0.6714, + "step": 833 + }, + { + "epoch": 0.11, + "grad_norm": 1.263272762298584, + "learning_rate": 9.846303489078608e-06, + "loss": 0.6267, + "step": 834 + }, + { + "epoch": 0.11, + "grad_norm": 0.9594109654426575, + "learning_rate": 9.845792540180569e-06, + "loss": 0.6712, + "step": 835 + }, + { + "epoch": 0.11, + "grad_norm": 1.1622424125671387, + "learning_rate": 9.845280756693262e-06, + "loss": 0.6655, + "step": 836 + }, + { + "epoch": 0.11, + "grad_norm": 1.1319442987442017, + "learning_rate": 9.844768138704825e-06, + "loss": 0.531, + "step": 837 + }, + { + "epoch": 0.11, + "grad_norm": 1.1469734907150269, + "learning_rate": 9.844254686303552e-06, + "loss": 0.5315, + "step": 838 + }, + { + "epoch": 0.11, + "grad_norm": 1.3223508596420288, + "learning_rate": 9.843740399577872e-06, + "loss": 0.5838, + "step": 839 + }, + { + "epoch": 0.11, + "grad_norm": 1.5986974239349365, + "learning_rate": 9.843225278616363e-06, + "loss": 0.7203, + "step": 840 + }, + { + "epoch": 0.11, + "grad_norm": 1.020804762840271, + "learning_rate": 9.842709323507737e-06, + "loss": 0.6836, + "step": 841 + }, + { + "epoch": 0.11, + "grad_norm": 1.5095237493515015, + "learning_rate": 9.842192534340864e-06, + "loss": 0.6815, + "step": 842 + }, + { + "epoch": 0.11, + "grad_norm": 1.7738572359085083, + "learning_rate": 9.84167491120475e-06, + "loss": 0.6532, + "step": 843 + }, + { + "epoch": 0.11, + "grad_norm": 1.0000008344650269, + "learning_rate": 9.84115645418854e-06, + "loss": 0.6143, + "step": 844 + }, + { + "epoch": 0.11, + "grad_norm": 1.1826478242874146, + "learning_rate": 9.840637163381534e-06, + "loss": 0.6351, + "step": 845 + }, + { + "epoch": 0.11, + "grad_norm": 1.4132177829742432, + "learning_rate": 9.840117038873165e-06, + "loss": 0.744, + "step": 846 + }, + { + "epoch": 0.11, + "grad_norm": 1.2735340595245361, + "learning_rate": 9.839596080753015e-06, + "loss": 0.7583, + "step": 847 + }, + { + "epoch": 0.11, + "grad_norm": 1.9481948614120483, + "learning_rate": 9.83907428911081e-06, + "loss": 0.6783, + "step": 848 + }, + { + "epoch": 0.11, + "grad_norm": 1.3666658401489258, + "learning_rate": 9.838551664036414e-06, + "loss": 0.6517, + "step": 849 + }, + { + "epoch": 0.11, + "grad_norm": 1.2822260856628418, + "learning_rate": 9.838028205619843e-06, + "loss": 0.6163, + "step": 850 + }, + { + "epoch": 0.11, + "grad_norm": 1.2314839363098145, + "learning_rate": 9.837503913951249e-06, + "loss": 0.6752, + "step": 851 + }, + { + "epoch": 0.11, + "grad_norm": 1.1731816530227661, + "learning_rate": 9.836978789120933e-06, + "loss": 0.648, + "step": 852 + }, + { + "epoch": 0.11, + "grad_norm": 1.2162363529205322, + "learning_rate": 9.836452831219334e-06, + "loss": 0.5972, + "step": 853 + }, + { + "epoch": 0.11, + "grad_norm": 1.1823762655258179, + "learning_rate": 9.83592604033704e-06, + "loss": 0.5839, + "step": 854 + }, + { + "epoch": 0.11, + "grad_norm": 1.037787675857544, + "learning_rate": 9.835398416564781e-06, + "loss": 0.6314, + "step": 855 + }, + { + "epoch": 0.11, + "grad_norm": 2.1973769664764404, + "learning_rate": 9.834869959993427e-06, + "loss": 0.6461, + "step": 856 + }, + { + "epoch": 0.11, + "grad_norm": 1.4603095054626465, + "learning_rate": 9.834340670713994e-06, + "loss": 0.548, + "step": 857 + }, + { + "epoch": 0.11, + "grad_norm": 1.1810789108276367, + "learning_rate": 9.833810548817644e-06, + "loss": 0.5911, + "step": 858 + }, + { + "epoch": 0.11, + "grad_norm": 1.5874276161193848, + "learning_rate": 9.833279594395676e-06, + "loss": 0.6555, + "step": 859 + }, + { + "epoch": 0.11, + "grad_norm": 1.2780070304870605, + "learning_rate": 9.832747807539538e-06, + "loss": 0.6349, + "step": 860 + }, + { + "epoch": 0.11, + "grad_norm": 1.2569433450698853, + "learning_rate": 9.83221518834082e-06, + "loss": 0.7218, + "step": 861 + }, + { + "epoch": 0.11, + "grad_norm": 1.1834901571273804, + "learning_rate": 9.831681736891255e-06, + "loss": 0.5845, + "step": 862 + }, + { + "epoch": 0.11, + "grad_norm": 1.2185614109039307, + "learning_rate": 9.831147453282717e-06, + "loss": 0.6264, + "step": 863 + }, + { + "epoch": 0.11, + "grad_norm": 1.341444969177246, + "learning_rate": 9.830612337607227e-06, + "loss": 0.7407, + "step": 864 + }, + { + "epoch": 0.11, + "grad_norm": 1.4523122310638428, + "learning_rate": 9.830076389956949e-06, + "loss": 0.6082, + "step": 865 + }, + { + "epoch": 0.11, + "grad_norm": 1.6930763721466064, + "learning_rate": 9.829539610424186e-06, + "loss": 0.621, + "step": 866 + }, + { + "epoch": 0.11, + "grad_norm": 1.5043352842330933, + "learning_rate": 9.82900199910139e-06, + "loss": 0.6928, + "step": 867 + }, + { + "epoch": 0.11, + "grad_norm": 1.6809231042861938, + "learning_rate": 9.828463556081153e-06, + "loss": 0.6384, + "step": 868 + }, + { + "epoch": 0.11, + "grad_norm": 1.4405546188354492, + "learning_rate": 9.827924281456211e-06, + "loss": 0.6553, + "step": 869 + }, + { + "epoch": 0.11, + "grad_norm": 1.1939656734466553, + "learning_rate": 9.827384175319443e-06, + "loss": 0.6822, + "step": 870 + }, + { + "epoch": 0.11, + "grad_norm": 1.065596103668213, + "learning_rate": 9.82684323776387e-06, + "loss": 0.6755, + "step": 871 + }, + { + "epoch": 0.11, + "grad_norm": 1.822675347328186, + "learning_rate": 9.82630146888266e-06, + "loss": 0.5842, + "step": 872 + }, + { + "epoch": 0.11, + "grad_norm": 1.0467437505722046, + "learning_rate": 9.825758868769117e-06, + "loss": 0.6246, + "step": 873 + }, + { + "epoch": 0.11, + "grad_norm": 1.2898199558258057, + "learning_rate": 9.8252154375167e-06, + "loss": 0.6213, + "step": 874 + }, + { + "epoch": 0.11, + "grad_norm": 1.1529115438461304, + "learning_rate": 9.824671175218999e-06, + "loss": 0.6626, + "step": 875 + }, + { + "epoch": 0.11, + "grad_norm": 1.5466557741165161, + "learning_rate": 9.82412608196975e-06, + "loss": 0.5785, + "step": 876 + }, + { + "epoch": 0.11, + "grad_norm": 0.961628794670105, + "learning_rate": 9.82358015786284e-06, + "loss": 0.5305, + "step": 877 + }, + { + "epoch": 0.11, + "grad_norm": 1.2553181648254395, + "learning_rate": 9.823033402992292e-06, + "loss": 0.6527, + "step": 878 + }, + { + "epoch": 0.11, + "grad_norm": 1.308699607849121, + "learning_rate": 9.82248581745227e-06, + "loss": 0.6968, + "step": 879 + }, + { + "epoch": 0.11, + "grad_norm": 2.1388115882873535, + "learning_rate": 9.821937401337086e-06, + "loss": 0.71, + "step": 880 + }, + { + "epoch": 0.11, + "grad_norm": 1.1120529174804688, + "learning_rate": 9.821388154741198e-06, + "loss": 0.7006, + "step": 881 + }, + { + "epoch": 0.11, + "grad_norm": 1.2875672578811646, + "learning_rate": 9.820838077759195e-06, + "loss": 0.638, + "step": 882 + }, + { + "epoch": 0.11, + "grad_norm": 1.338017463684082, + "learning_rate": 9.820287170485822e-06, + "loss": 0.5622, + "step": 883 + }, + { + "epoch": 0.11, + "grad_norm": 1.2129225730895996, + "learning_rate": 9.81973543301596e-06, + "loss": 0.6085, + "step": 884 + }, + { + "epoch": 0.11, + "grad_norm": 1.669423222541809, + "learning_rate": 9.819182865444633e-06, + "loss": 0.6449, + "step": 885 + }, + { + "epoch": 0.11, + "grad_norm": 1.1424647569656372, + "learning_rate": 9.81862946786701e-06, + "loss": 0.5807, + "step": 886 + }, + { + "epoch": 0.11, + "grad_norm": 1.2412691116333008, + "learning_rate": 9.818075240378406e-06, + "loss": 0.6645, + "step": 887 + }, + { + "epoch": 0.11, + "grad_norm": 1.2484862804412842, + "learning_rate": 9.817520183074273e-06, + "loss": 0.6166, + "step": 888 + }, + { + "epoch": 0.11, + "grad_norm": 1.2282848358154297, + "learning_rate": 9.816964296050208e-06, + "loss": 0.5975, + "step": 889 + }, + { + "epoch": 0.11, + "grad_norm": 1.0314067602157593, + "learning_rate": 9.81640757940195e-06, + "loss": 0.6313, + "step": 890 + }, + { + "epoch": 0.11, + "grad_norm": 1.192475438117981, + "learning_rate": 9.815850033225384e-06, + "loss": 0.6075, + "step": 891 + }, + { + "epoch": 0.11, + "grad_norm": 1.1968879699707031, + "learning_rate": 9.815291657616537e-06, + "loss": 0.6369, + "step": 892 + }, + { + "epoch": 0.11, + "grad_norm": 1.4490315914154053, + "learning_rate": 9.814732452671575e-06, + "loss": 0.6103, + "step": 893 + }, + { + "epoch": 0.11, + "grad_norm": 1.05226731300354, + "learning_rate": 9.81417241848681e-06, + "loss": 0.6934, + "step": 894 + }, + { + "epoch": 0.11, + "grad_norm": 1.841873049736023, + "learning_rate": 9.813611555158701e-06, + "loss": 0.6649, + "step": 895 + }, + { + "epoch": 0.11, + "grad_norm": 1.2420374155044556, + "learning_rate": 9.81304986278384e-06, + "loss": 0.6011, + "step": 896 + }, + { + "epoch": 0.11, + "grad_norm": 1.2118569612503052, + "learning_rate": 9.81248734145897e-06, + "loss": 0.6009, + "step": 897 + }, + { + "epoch": 0.12, + "grad_norm": 1.2946357727050781, + "learning_rate": 9.811923991280972e-06, + "loss": 0.6431, + "step": 898 + }, + { + "epoch": 0.12, + "grad_norm": 1.1636987924575806, + "learning_rate": 9.811359812346872e-06, + "loss": 0.741, + "step": 899 + }, + { + "epoch": 0.12, + "grad_norm": 1.2798045873641968, + "learning_rate": 9.810794804753838e-06, + "loss": 0.5853, + "step": 900 + }, + { + "epoch": 0.12, + "grad_norm": 1.2087842226028442, + "learning_rate": 9.810228968599185e-06, + "loss": 0.675, + "step": 901 + }, + { + "epoch": 0.12, + "grad_norm": 1.2338616847991943, + "learning_rate": 9.809662303980361e-06, + "loss": 0.62, + "step": 902 + }, + { + "epoch": 0.12, + "grad_norm": 1.2784311771392822, + "learning_rate": 9.809094810994966e-06, + "loss": 0.6567, + "step": 903 + }, + { + "epoch": 0.12, + "grad_norm": 1.1460515260696411, + "learning_rate": 9.808526489740739e-06, + "loss": 0.6384, + "step": 904 + }, + { + "epoch": 0.12, + "grad_norm": 1.9330402612686157, + "learning_rate": 9.80795734031556e-06, + "loss": 0.5802, + "step": 905 + }, + { + "epoch": 0.12, + "grad_norm": 1.233659029006958, + "learning_rate": 9.807387362817455e-06, + "loss": 0.7364, + "step": 906 + }, + { + "epoch": 0.12, + "grad_norm": 3.561555862426758, + "learning_rate": 9.806816557344592e-06, + "loss": 0.6519, + "step": 907 + }, + { + "epoch": 0.12, + "grad_norm": 1.063315510749817, + "learning_rate": 9.806244923995275e-06, + "loss": 0.6765, + "step": 908 + }, + { + "epoch": 0.12, + "grad_norm": 1.2617602348327637, + "learning_rate": 9.805672462867966e-06, + "loss": 0.6217, + "step": 909 + }, + { + "epoch": 0.12, + "grad_norm": 1.2094106674194336, + "learning_rate": 9.80509917406125e-06, + "loss": 0.5438, + "step": 910 + }, + { + "epoch": 0.12, + "grad_norm": 1.0630391836166382, + "learning_rate": 9.804525057673871e-06, + "loss": 0.5533, + "step": 911 + }, + { + "epoch": 0.12, + "grad_norm": 1.280405879020691, + "learning_rate": 9.803950113804705e-06, + "loss": 0.6173, + "step": 912 + }, + { + "epoch": 0.12, + "grad_norm": 0.9976212978363037, + "learning_rate": 9.803374342552776e-06, + "loss": 0.6336, + "step": 913 + }, + { + "epoch": 0.12, + "grad_norm": 1.2395212650299072, + "learning_rate": 9.80279774401725e-06, + "loss": 0.639, + "step": 914 + }, + { + "epoch": 0.12, + "grad_norm": 1.0815329551696777, + "learning_rate": 9.802220318297432e-06, + "loss": 0.5816, + "step": 915 + }, + { + "epoch": 0.12, + "grad_norm": 1.3019007444381714, + "learning_rate": 9.801642065492773e-06, + "loss": 0.5943, + "step": 916 + }, + { + "epoch": 0.12, + "grad_norm": 1.2104711532592773, + "learning_rate": 9.801062985702865e-06, + "loss": 0.6206, + "step": 917 + }, + { + "epoch": 0.12, + "grad_norm": 1.823017954826355, + "learning_rate": 9.800483079027443e-06, + "loss": 0.6293, + "step": 918 + }, + { + "epoch": 0.12, + "grad_norm": 1.2773282527923584, + "learning_rate": 9.799902345566384e-06, + "loss": 0.5979, + "step": 919 + }, + { + "epoch": 0.12, + "grad_norm": 1.3274632692337036, + "learning_rate": 9.799320785419709e-06, + "loss": 0.6451, + "step": 920 + }, + { + "epoch": 0.12, + "grad_norm": 1.7359416484832764, + "learning_rate": 9.798738398687577e-06, + "loss": 0.6662, + "step": 921 + }, + { + "epoch": 0.12, + "grad_norm": 1.245658278465271, + "learning_rate": 9.798155185470296e-06, + "loss": 0.726, + "step": 922 + }, + { + "epoch": 0.12, + "grad_norm": 2.3268086910247803, + "learning_rate": 9.797571145868308e-06, + "loss": 0.6302, + "step": 923 + }, + { + "epoch": 0.12, + "grad_norm": 1.6008139848709106, + "learning_rate": 9.796986279982204e-06, + "loss": 0.6036, + "step": 924 + }, + { + "epoch": 0.12, + "grad_norm": 1.4884922504425049, + "learning_rate": 9.796400587912717e-06, + "loss": 0.5932, + "step": 925 + }, + { + "epoch": 0.12, + "grad_norm": 1.7857106924057007, + "learning_rate": 9.795814069760719e-06, + "loss": 0.7126, + "step": 926 + }, + { + "epoch": 0.12, + "grad_norm": 1.5720033645629883, + "learning_rate": 9.795226725627225e-06, + "loss": 0.6674, + "step": 927 + }, + { + "epoch": 0.12, + "grad_norm": 1.4257798194885254, + "learning_rate": 9.794638555613396e-06, + "loss": 0.6579, + "step": 928 + }, + { + "epoch": 0.12, + "grad_norm": 1.1718521118164062, + "learning_rate": 9.794049559820531e-06, + "loss": 0.7428, + "step": 929 + }, + { + "epoch": 0.12, + "grad_norm": 1.1296699047088623, + "learning_rate": 9.79345973835007e-06, + "loss": 0.6206, + "step": 930 + }, + { + "epoch": 0.12, + "grad_norm": 1.188733458518982, + "learning_rate": 9.7928690913036e-06, + "loss": 0.6334, + "step": 931 + }, + { + "epoch": 0.12, + "grad_norm": 1.2753381729125977, + "learning_rate": 9.792277618782848e-06, + "loss": 0.5784, + "step": 932 + }, + { + "epoch": 0.12, + "grad_norm": 1.5454579591751099, + "learning_rate": 9.791685320889683e-06, + "loss": 0.6151, + "step": 933 + }, + { + "epoch": 0.12, + "grad_norm": 1.210261344909668, + "learning_rate": 9.791092197726118e-06, + "loss": 0.6163, + "step": 934 + }, + { + "epoch": 0.12, + "grad_norm": 1.2641156911849976, + "learning_rate": 9.790498249394301e-06, + "loss": 0.5978, + "step": 935 + }, + { + "epoch": 0.12, + "grad_norm": 1.322339415550232, + "learning_rate": 9.789903475996533e-06, + "loss": 0.6684, + "step": 936 + }, + { + "epoch": 0.12, + "grad_norm": 1.0136418342590332, + "learning_rate": 9.78930787763525e-06, + "loss": 0.6621, + "step": 937 + }, + { + "epoch": 0.12, + "grad_norm": 1.7304836511611938, + "learning_rate": 9.78871145441303e-06, + "loss": 0.679, + "step": 938 + }, + { + "epoch": 0.12, + "grad_norm": 1.1343514919281006, + "learning_rate": 9.788114206432597e-06, + "loss": 0.6472, + "step": 939 + }, + { + "epoch": 0.12, + "grad_norm": 1.1854240894317627, + "learning_rate": 9.787516133796816e-06, + "loss": 0.5903, + "step": 940 + }, + { + "epoch": 0.12, + "grad_norm": 1.0253652334213257, + "learning_rate": 9.786917236608687e-06, + "loss": 0.6461, + "step": 941 + }, + { + "epoch": 0.12, + "grad_norm": 1.077979564666748, + "learning_rate": 9.786317514971364e-06, + "loss": 0.6138, + "step": 942 + }, + { + "epoch": 0.12, + "grad_norm": 1.2683407068252563, + "learning_rate": 9.785716968988133e-06, + "loss": 0.6181, + "step": 943 + }, + { + "epoch": 0.12, + "grad_norm": 1.0985451936721802, + "learning_rate": 9.785115598762429e-06, + "loss": 0.6436, + "step": 944 + }, + { + "epoch": 0.12, + "grad_norm": 1.1492769718170166, + "learning_rate": 9.784513404397823e-06, + "loss": 0.6205, + "step": 945 + }, + { + "epoch": 0.12, + "grad_norm": 1.1750531196594238, + "learning_rate": 9.783910385998034e-06, + "loss": 0.6014, + "step": 946 + }, + { + "epoch": 0.12, + "grad_norm": 1.1315195560455322, + "learning_rate": 9.783306543666916e-06, + "loss": 0.6288, + "step": 947 + }, + { + "epoch": 0.12, + "grad_norm": 1.1699100732803345, + "learning_rate": 9.78270187750847e-06, + "loss": 0.6545, + "step": 948 + }, + { + "epoch": 0.12, + "grad_norm": 1.1685606241226196, + "learning_rate": 9.782096387626839e-06, + "loss": 0.6522, + "step": 949 + }, + { + "epoch": 0.12, + "grad_norm": 1.6117463111877441, + "learning_rate": 9.781490074126305e-06, + "loss": 0.593, + "step": 950 + }, + { + "epoch": 0.12, + "grad_norm": 1.1152682304382324, + "learning_rate": 9.780882937111292e-06, + "loss": 0.6196, + "step": 951 + }, + { + "epoch": 0.12, + "grad_norm": 1.2452787160873413, + "learning_rate": 9.780274976686369e-06, + "loss": 0.6037, + "step": 952 + }, + { + "epoch": 0.12, + "grad_norm": 1.2231415510177612, + "learning_rate": 9.779666192956243e-06, + "loss": 0.5567, + "step": 953 + }, + { + "epoch": 0.12, + "grad_norm": 1.1500630378723145, + "learning_rate": 9.779056586025766e-06, + "loss": 0.6495, + "step": 954 + }, + { + "epoch": 0.12, + "grad_norm": 1.3221664428710938, + "learning_rate": 9.77844615599993e-06, + "loss": 0.6374, + "step": 955 + }, + { + "epoch": 0.12, + "grad_norm": 1.1846741437911987, + "learning_rate": 9.777834902983869e-06, + "loss": 0.6598, + "step": 956 + }, + { + "epoch": 0.12, + "grad_norm": 1.418726921081543, + "learning_rate": 9.77722282708286e-06, + "loss": 0.7151, + "step": 957 + }, + { + "epoch": 0.12, + "grad_norm": 1.4351948499679565, + "learning_rate": 9.776609928402319e-06, + "loss": 0.6353, + "step": 958 + }, + { + "epoch": 0.12, + "grad_norm": 1.146484136581421, + "learning_rate": 9.775996207047804e-06, + "loss": 0.6261, + "step": 959 + }, + { + "epoch": 0.12, + "grad_norm": 1.1938923597335815, + "learning_rate": 9.775381663125021e-06, + "loss": 0.6846, + "step": 960 + }, + { + "epoch": 0.12, + "grad_norm": 1.3629497289657593, + "learning_rate": 9.774766296739808e-06, + "loss": 0.6335, + "step": 961 + }, + { + "epoch": 0.12, + "grad_norm": 1.2427971363067627, + "learning_rate": 9.774150107998153e-06, + "loss": 0.7141, + "step": 962 + }, + { + "epoch": 0.12, + "grad_norm": 1.049538016319275, + "learning_rate": 9.77353309700618e-06, + "loss": 0.6309, + "step": 963 + }, + { + "epoch": 0.12, + "grad_norm": 1.2472624778747559, + "learning_rate": 9.772915263870155e-06, + "loss": 0.6833, + "step": 964 + }, + { + "epoch": 0.12, + "grad_norm": 1.1295534372329712, + "learning_rate": 9.77229660869649e-06, + "loss": 0.5353, + "step": 965 + }, + { + "epoch": 0.12, + "grad_norm": 1.565324306488037, + "learning_rate": 9.771677131591734e-06, + "loss": 0.6867, + "step": 966 + }, + { + "epoch": 0.12, + "grad_norm": 1.5009647607803345, + "learning_rate": 9.771056832662581e-06, + "loss": 0.6216, + "step": 967 + }, + { + "epoch": 0.12, + "grad_norm": 1.4722943305969238, + "learning_rate": 9.770435712015863e-06, + "loss": 0.6958, + "step": 968 + }, + { + "epoch": 0.12, + "grad_norm": 1.2581201791763306, + "learning_rate": 9.769813769758556e-06, + "loss": 0.455, + "step": 969 + }, + { + "epoch": 0.12, + "grad_norm": 1.1791751384735107, + "learning_rate": 9.769191005997781e-06, + "loss": 0.6739, + "step": 970 + }, + { + "epoch": 0.12, + "grad_norm": 1.7730474472045898, + "learning_rate": 9.76856742084079e-06, + "loss": 0.6605, + "step": 971 + }, + { + "epoch": 0.12, + "grad_norm": 1.2831331491470337, + "learning_rate": 9.767943014394982e-06, + "loss": 0.6259, + "step": 972 + }, + { + "epoch": 0.12, + "grad_norm": 1.7151639461517334, + "learning_rate": 9.767317786767907e-06, + "loss": 0.6003, + "step": 973 + }, + { + "epoch": 0.12, + "grad_norm": 1.258449912071228, + "learning_rate": 9.76669173806724e-06, + "loss": 0.6271, + "step": 974 + }, + { + "epoch": 0.12, + "grad_norm": 1.1486563682556152, + "learning_rate": 9.766064868400808e-06, + "loss": 0.6293, + "step": 975 + }, + { + "epoch": 0.13, + "grad_norm": 1.290198564529419, + "learning_rate": 9.765437177876576e-06, + "loss": 0.6359, + "step": 976 + }, + { + "epoch": 0.13, + "grad_norm": 1.152216911315918, + "learning_rate": 9.764808666602651e-06, + "loss": 0.6303, + "step": 977 + }, + { + "epoch": 0.13, + "grad_norm": 1.1518402099609375, + "learning_rate": 9.764179334687284e-06, + "loss": 0.6305, + "step": 978 + }, + { + "epoch": 0.13, + "grad_norm": 1.1398547887802124, + "learning_rate": 9.76354918223886e-06, + "loss": 0.5392, + "step": 979 + }, + { + "epoch": 0.13, + "grad_norm": 1.361898422241211, + "learning_rate": 9.762918209365914e-06, + "loss": 0.6505, + "step": 980 + }, + { + "epoch": 0.13, + "grad_norm": 1.2657636404037476, + "learning_rate": 9.762286416177114e-06, + "loss": 0.631, + "step": 981 + }, + { + "epoch": 0.13, + "grad_norm": 1.4761649370193481, + "learning_rate": 9.761653802781276e-06, + "loss": 0.6788, + "step": 982 + }, + { + "epoch": 0.13, + "grad_norm": 1.5255000591278076, + "learning_rate": 9.761020369287358e-06, + "loss": 0.572, + "step": 983 + }, + { + "epoch": 0.13, + "grad_norm": 1.4596574306488037, + "learning_rate": 9.76038611580445e-06, + "loss": 0.6187, + "step": 984 + }, + { + "epoch": 0.13, + "grad_norm": 1.4689172506332397, + "learning_rate": 9.759751042441792e-06, + "loss": 0.5464, + "step": 985 + }, + { + "epoch": 0.13, + "grad_norm": 1.1188050508499146, + "learning_rate": 9.759115149308761e-06, + "loss": 0.6616, + "step": 986 + }, + { + "epoch": 0.13, + "grad_norm": 1.3672010898590088, + "learning_rate": 9.758478436514882e-06, + "loss": 0.7279, + "step": 987 + }, + { + "epoch": 0.13, + "grad_norm": 1.522152304649353, + "learning_rate": 9.75784090416981e-06, + "loss": 0.5857, + "step": 988 + }, + { + "epoch": 0.13, + "grad_norm": 1.1300358772277832, + "learning_rate": 9.757202552383349e-06, + "loss": 0.6825, + "step": 989 + }, + { + "epoch": 0.13, + "grad_norm": 1.252014398574829, + "learning_rate": 9.756563381265443e-06, + "loss": 0.6484, + "step": 990 + }, + { + "epoch": 0.13, + "grad_norm": 1.564777135848999, + "learning_rate": 9.755923390926173e-06, + "loss": 0.6302, + "step": 991 + }, + { + "epoch": 0.13, + "grad_norm": 1.6608165502548218, + "learning_rate": 9.755282581475769e-06, + "loss": 0.6619, + "step": 992 + }, + { + "epoch": 0.13, + "grad_norm": 2.2679226398468018, + "learning_rate": 9.754640953024595e-06, + "loss": 0.6166, + "step": 993 + }, + { + "epoch": 0.13, + "grad_norm": 1.119726538658142, + "learning_rate": 9.753998505683157e-06, + "loss": 0.6459, + "step": 994 + }, + { + "epoch": 0.13, + "grad_norm": 1.1786524057388306, + "learning_rate": 9.753355239562106e-06, + "loss": 0.6327, + "step": 995 + }, + { + "epoch": 0.13, + "grad_norm": 1.1883645057678223, + "learning_rate": 9.75271115477223e-06, + "loss": 0.6711, + "step": 996 + }, + { + "epoch": 0.13, + "grad_norm": 1.6034826040267944, + "learning_rate": 9.75206625142446e-06, + "loss": 0.6559, + "step": 997 + }, + { + "epoch": 0.13, + "grad_norm": 1.1943557262420654, + "learning_rate": 9.751420529629869e-06, + "loss": 0.5873, + "step": 998 + }, + { + "epoch": 0.13, + "grad_norm": 1.274271011352539, + "learning_rate": 9.750773989499665e-06, + "loss": 0.6383, + "step": 999 + }, + { + "epoch": 0.13, + "grad_norm": 1.3029786348342896, + "learning_rate": 9.750126631145207e-06, + "loss": 0.6523, + "step": 1000 + }, + { + "epoch": 0.13, + "grad_norm": 1.4161441326141357, + "learning_rate": 9.749478454677986e-06, + "loss": 0.6542, + "step": 1001 + }, + { + "epoch": 0.13, + "grad_norm": 1.0572928190231323, + "learning_rate": 9.748829460209638e-06, + "loss": 0.6725, + "step": 1002 + }, + { + "epoch": 0.13, + "grad_norm": 1.0757194757461548, + "learning_rate": 9.748179647851939e-06, + "loss": 0.5555, + "step": 1003 + }, + { + "epoch": 0.13, + "grad_norm": 2.006316900253296, + "learning_rate": 9.747529017716806e-06, + "loss": 0.6257, + "step": 1004 + }, + { + "epoch": 0.13, + "grad_norm": 1.1826972961425781, + "learning_rate": 9.746877569916297e-06, + "loss": 0.6045, + "step": 1005 + }, + { + "epoch": 0.13, + "grad_norm": 1.1543837785720825, + "learning_rate": 9.74622530456261e-06, + "loss": 0.5696, + "step": 1006 + }, + { + "epoch": 0.13, + "grad_norm": 1.5225355625152588, + "learning_rate": 9.745572221768085e-06, + "loss": 0.6135, + "step": 1007 + }, + { + "epoch": 0.13, + "grad_norm": 1.3935890197753906, + "learning_rate": 9.744918321645203e-06, + "loss": 0.6532, + "step": 1008 + }, + { + "epoch": 0.13, + "grad_norm": 1.2201215028762817, + "learning_rate": 9.744263604306582e-06, + "loss": 0.654, + "step": 1009 + }, + { + "epoch": 0.13, + "grad_norm": 1.4088938236236572, + "learning_rate": 9.743608069864988e-06, + "loss": 0.6236, + "step": 1010 + }, + { + "epoch": 0.13, + "grad_norm": 1.1685870885849, + "learning_rate": 9.742951718433321e-06, + "loss": 0.6695, + "step": 1011 + }, + { + "epoch": 0.13, + "grad_norm": 1.9566943645477295, + "learning_rate": 9.742294550124625e-06, + "loss": 0.6839, + "step": 1012 + }, + { + "epoch": 0.13, + "grad_norm": 1.222412347793579, + "learning_rate": 9.741636565052081e-06, + "loss": 0.614, + "step": 1013 + }, + { + "epoch": 0.13, + "grad_norm": 1.131980299949646, + "learning_rate": 9.740977763329018e-06, + "loss": 0.598, + "step": 1014 + }, + { + "epoch": 0.13, + "grad_norm": 1.1899813413619995, + "learning_rate": 9.740318145068902e-06, + "loss": 0.6678, + "step": 1015 + }, + { + "epoch": 0.13, + "grad_norm": 1.1226192712783813, + "learning_rate": 9.739657710385332e-06, + "loss": 0.6595, + "step": 1016 + }, + { + "epoch": 0.13, + "grad_norm": 3.9692671298980713, + "learning_rate": 9.73899645939206e-06, + "loss": 0.6617, + "step": 1017 + }, + { + "epoch": 0.13, + "grad_norm": 1.3050158023834229, + "learning_rate": 9.738334392202971e-06, + "loss": 0.6292, + "step": 1018 + }, + { + "epoch": 0.13, + "grad_norm": 1.244711995124817, + "learning_rate": 9.737671508932093e-06, + "loss": 0.6136, + "step": 1019 + }, + { + "epoch": 0.13, + "grad_norm": 1.4500633478164673, + "learning_rate": 9.737007809693595e-06, + "loss": 0.6355, + "step": 1020 + }, + { + "epoch": 0.13, + "grad_norm": 1.2068707942962646, + "learning_rate": 9.736343294601784e-06, + "loss": 0.5697, + "step": 1021 + }, + { + "epoch": 0.13, + "grad_norm": 1.1867763996124268, + "learning_rate": 9.735677963771112e-06, + "loss": 0.672, + "step": 1022 + }, + { + "epoch": 0.13, + "grad_norm": 1.561131477355957, + "learning_rate": 9.735011817316166e-06, + "loss": 0.6256, + "step": 1023 + }, + { + "epoch": 0.13, + "grad_norm": 4.072286128997803, + "learning_rate": 9.734344855351678e-06, + "loss": 0.6563, + "step": 1024 + }, + { + "epoch": 0.13, + "grad_norm": 1.22327721118927, + "learning_rate": 9.733677077992515e-06, + "loss": 0.6342, + "step": 1025 + }, + { + "epoch": 0.13, + "grad_norm": 1.7818228006362915, + "learning_rate": 9.733008485353694e-06, + "loss": 0.645, + "step": 1026 + }, + { + "epoch": 0.13, + "grad_norm": 1.4120960235595703, + "learning_rate": 9.732339077550364e-06, + "loss": 0.6384, + "step": 1027 + }, + { + "epoch": 0.13, + "grad_norm": 1.8205273151397705, + "learning_rate": 9.731668854697814e-06, + "loss": 0.6097, + "step": 1028 + }, + { + "epoch": 0.13, + "grad_norm": 1.6517037153244019, + "learning_rate": 9.730997816911479e-06, + "loss": 0.6568, + "step": 1029 + }, + { + "epoch": 0.13, + "grad_norm": 1.1279886960983276, + "learning_rate": 9.730325964306931e-06, + "loss": 0.5864, + "step": 1030 + }, + { + "epoch": 0.13, + "grad_norm": 1.175146222114563, + "learning_rate": 9.729653296999883e-06, + "loss": 0.6494, + "step": 1031 + }, + { + "epoch": 0.13, + "grad_norm": 1.0476584434509277, + "learning_rate": 9.72897981510619e-06, + "loss": 0.5538, + "step": 1032 + }, + { + "epoch": 0.13, + "grad_norm": 1.0945863723754883, + "learning_rate": 9.728305518741844e-06, + "loss": 0.589, + "step": 1033 + }, + { + "epoch": 0.13, + "grad_norm": 1.1936923265457153, + "learning_rate": 9.727630408022977e-06, + "loss": 0.5998, + "step": 1034 + }, + { + "epoch": 0.13, + "grad_norm": 1.1819581985473633, + "learning_rate": 9.726954483065864e-06, + "loss": 0.6506, + "step": 1035 + }, + { + "epoch": 0.13, + "grad_norm": 1.1652607917785645, + "learning_rate": 9.726277743986923e-06, + "loss": 0.6516, + "step": 1036 + }, + { + "epoch": 0.13, + "grad_norm": 1.4037582874298096, + "learning_rate": 9.725600190902704e-06, + "loss": 0.7143, + "step": 1037 + }, + { + "epoch": 0.13, + "grad_norm": 1.1429693698883057, + "learning_rate": 9.724921823929905e-06, + "loss": 0.6593, + "step": 1038 + }, + { + "epoch": 0.13, + "grad_norm": 1.021639347076416, + "learning_rate": 9.724242643185357e-06, + "loss": 0.6161, + "step": 1039 + }, + { + "epoch": 0.13, + "grad_norm": 1.2161128520965576, + "learning_rate": 9.72356264878604e-06, + "loss": 0.6372, + "step": 1040 + }, + { + "epoch": 0.13, + "grad_norm": 1.1113938093185425, + "learning_rate": 9.722881840849066e-06, + "loss": 0.63, + "step": 1041 + }, + { + "epoch": 0.13, + "grad_norm": 1.1752476692199707, + "learning_rate": 9.722200219491692e-06, + "loss": 0.7094, + "step": 1042 + }, + { + "epoch": 0.13, + "grad_norm": 1.142798662185669, + "learning_rate": 9.72151778483131e-06, + "loss": 0.6925, + "step": 1043 + }, + { + "epoch": 0.13, + "grad_norm": 1.1876989603042603, + "learning_rate": 9.72083453698546e-06, + "loss": 0.6793, + "step": 1044 + }, + { + "epoch": 0.13, + "grad_norm": 1.0826714038848877, + "learning_rate": 9.720150476071814e-06, + "loss": 0.6886, + "step": 1045 + }, + { + "epoch": 0.13, + "grad_norm": 2.2857930660247803, + "learning_rate": 9.71946560220819e-06, + "loss": 0.5505, + "step": 1046 + }, + { + "epoch": 0.13, + "grad_norm": 1.1982847452163696, + "learning_rate": 9.718779915512542e-06, + "loss": 0.651, + "step": 1047 + }, + { + "epoch": 0.13, + "grad_norm": 1.4459384679794312, + "learning_rate": 9.718093416102967e-06, + "loss": 0.6223, + "step": 1048 + }, + { + "epoch": 0.13, + "grad_norm": 1.1260446310043335, + "learning_rate": 9.717406104097702e-06, + "loss": 0.6248, + "step": 1049 + }, + { + "epoch": 0.13, + "grad_norm": 1.0635147094726562, + "learning_rate": 9.716717979615118e-06, + "loss": 0.5919, + "step": 1050 + }, + { + "epoch": 0.13, + "grad_norm": 1.1763317584991455, + "learning_rate": 9.716029042773733e-06, + "loss": 0.636, + "step": 1051 + }, + { + "epoch": 0.13, + "grad_norm": 1.2086479663848877, + "learning_rate": 9.715339293692203e-06, + "loss": 0.651, + "step": 1052 + }, + { + "epoch": 0.13, + "grad_norm": 1.3193252086639404, + "learning_rate": 9.714648732489322e-06, + "loss": 0.633, + "step": 1053 + }, + { + "epoch": 0.14, + "grad_norm": 1.6744786500930786, + "learning_rate": 9.713957359284025e-06, + "loss": 0.5887, + "step": 1054 + }, + { + "epoch": 0.14, + "grad_norm": 1.2904282808303833, + "learning_rate": 9.713265174195388e-06, + "loss": 0.68, + "step": 1055 + }, + { + "epoch": 0.14, + "grad_norm": 1.4331204891204834, + "learning_rate": 9.712572177342627e-06, + "loss": 0.5989, + "step": 1056 + }, + { + "epoch": 0.14, + "grad_norm": 0.9341638684272766, + "learning_rate": 9.711878368845094e-06, + "loss": 0.647, + "step": 1057 + }, + { + "epoch": 0.14, + "grad_norm": 1.3745675086975098, + "learning_rate": 9.711183748822284e-06, + "loss": 0.6102, + "step": 1058 + }, + { + "epoch": 0.14, + "grad_norm": 1.2217185497283936, + "learning_rate": 9.710488317393832e-06, + "loss": 0.6091, + "step": 1059 + }, + { + "epoch": 0.14, + "grad_norm": 2.1755502223968506, + "learning_rate": 9.709792074679511e-06, + "loss": 0.6482, + "step": 1060 + }, + { + "epoch": 0.14, + "grad_norm": 1.2285881042480469, + "learning_rate": 9.709095020799236e-06, + "loss": 0.7853, + "step": 1061 + }, + { + "epoch": 0.14, + "grad_norm": 1.1444462537765503, + "learning_rate": 9.70839715587306e-06, + "loss": 0.6351, + "step": 1062 + }, + { + "epoch": 0.14, + "grad_norm": 1.0068621635437012, + "learning_rate": 9.707698480021174e-06, + "loss": 0.7285, + "step": 1063 + }, + { + "epoch": 0.14, + "grad_norm": 1.1805295944213867, + "learning_rate": 9.706998993363913e-06, + "loss": 0.5867, + "step": 1064 + }, + { + "epoch": 0.14, + "grad_norm": 1.2351415157318115, + "learning_rate": 9.70629869602175e-06, + "loss": 0.5677, + "step": 1065 + }, + { + "epoch": 0.14, + "grad_norm": 1.22136652469635, + "learning_rate": 9.705597588115295e-06, + "loss": 0.647, + "step": 1066 + }, + { + "epoch": 0.14, + "grad_norm": 1.124104619026184, + "learning_rate": 9.7048956697653e-06, + "loss": 0.5704, + "step": 1067 + }, + { + "epoch": 0.14, + "grad_norm": 1.2475918531417847, + "learning_rate": 9.704192941092657e-06, + "loss": 0.6191, + "step": 1068 + }, + { + "epoch": 0.14, + "grad_norm": 1.219774842262268, + "learning_rate": 9.703489402218394e-06, + "loss": 0.7046, + "step": 1069 + }, + { + "epoch": 0.14, + "grad_norm": 1.1891069412231445, + "learning_rate": 9.702785053263685e-06, + "loss": 0.5343, + "step": 1070 + }, + { + "epoch": 0.14, + "grad_norm": 1.167614459991455, + "learning_rate": 9.702079894349838e-06, + "loss": 0.6189, + "step": 1071 + }, + { + "epoch": 0.14, + "grad_norm": 1.0407310724258423, + "learning_rate": 9.701373925598304e-06, + "loss": 0.6745, + "step": 1072 + }, + { + "epoch": 0.14, + "grad_norm": 1.5812993049621582, + "learning_rate": 9.70066714713067e-06, + "loss": 0.632, + "step": 1073 + }, + { + "epoch": 0.14, + "grad_norm": 1.2237656116485596, + "learning_rate": 9.699959559068664e-06, + "loss": 0.738, + "step": 1074 + }, + { + "epoch": 0.14, + "grad_norm": 1.2793015241622925, + "learning_rate": 9.699251161534153e-06, + "loss": 0.7889, + "step": 1075 + }, + { + "epoch": 0.14, + "grad_norm": 1.167056679725647, + "learning_rate": 9.698541954649145e-06, + "loss": 0.6292, + "step": 1076 + }, + { + "epoch": 0.14, + "grad_norm": 1.5537493228912354, + "learning_rate": 9.69783193853579e-06, + "loss": 0.6604, + "step": 1077 + }, + { + "epoch": 0.14, + "grad_norm": 1.2457462549209595, + "learning_rate": 9.697121113316367e-06, + "loss": 0.618, + "step": 1078 + }, + { + "epoch": 0.14, + "grad_norm": 1.6209555864334106, + "learning_rate": 9.696409479113306e-06, + "loss": 0.5909, + "step": 1079 + }, + { + "epoch": 0.14, + "grad_norm": 1.1111366748809814, + "learning_rate": 9.695697036049173e-06, + "loss": 0.6108, + "step": 1080 + }, + { + "epoch": 0.14, + "grad_norm": 1.2718392610549927, + "learning_rate": 9.694983784246665e-06, + "loss": 0.7539, + "step": 1081 + }, + { + "epoch": 0.14, + "grad_norm": 1.2289928197860718, + "learning_rate": 9.69426972382863e-06, + "loss": 0.5455, + "step": 1082 + }, + { + "epoch": 0.14, + "grad_norm": 1.0418940782546997, + "learning_rate": 9.693554854918052e-06, + "loss": 0.5804, + "step": 1083 + }, + { + "epoch": 0.14, + "grad_norm": 1.1238722801208496, + "learning_rate": 9.692839177638049e-06, + "loss": 0.6448, + "step": 1084 + }, + { + "epoch": 0.14, + "grad_norm": 1.0625381469726562, + "learning_rate": 9.692122692111883e-06, + "loss": 0.6399, + "step": 1085 + }, + { + "epoch": 0.14, + "grad_norm": 1.4463095664978027, + "learning_rate": 9.691405398462956e-06, + "loss": 0.6596, + "step": 1086 + }, + { + "epoch": 0.14, + "grad_norm": 1.1603282690048218, + "learning_rate": 9.690687296814803e-06, + "loss": 0.615, + "step": 1087 + }, + { + "epoch": 0.14, + "grad_norm": 1.1188900470733643, + "learning_rate": 9.689968387291107e-06, + "loss": 0.5733, + "step": 1088 + }, + { + "epoch": 0.14, + "grad_norm": 1.2128100395202637, + "learning_rate": 9.68924867001568e-06, + "loss": 0.6144, + "step": 1089 + }, + { + "epoch": 0.14, + "grad_norm": 1.295868158340454, + "learning_rate": 9.688528145112486e-06, + "loss": 0.6801, + "step": 1090 + }, + { + "epoch": 0.14, + "grad_norm": 1.2260469198226929, + "learning_rate": 9.687806812705616e-06, + "loss": 0.5645, + "step": 1091 + }, + { + "epoch": 0.14, + "grad_norm": 1.2258944511413574, + "learning_rate": 9.687084672919305e-06, + "loss": 0.5889, + "step": 1092 + }, + { + "epoch": 0.14, + "grad_norm": 1.4465066194534302, + "learning_rate": 9.686361725877926e-06, + "loss": 0.6063, + "step": 1093 + }, + { + "epoch": 0.14, + "grad_norm": 1.3785881996154785, + "learning_rate": 9.685637971705998e-06, + "loss": 0.5688, + "step": 1094 + }, + { + "epoch": 0.14, + "grad_norm": 1.1346842050552368, + "learning_rate": 9.684913410528164e-06, + "loss": 0.609, + "step": 1095 + }, + { + "epoch": 0.14, + "grad_norm": 1.1224477291107178, + "learning_rate": 9.68418804246922e-06, + "loss": 0.6036, + "step": 1096 + }, + { + "epoch": 0.14, + "grad_norm": 1.019282579421997, + "learning_rate": 9.683461867654098e-06, + "loss": 0.6989, + "step": 1097 + }, + { + "epoch": 0.14, + "grad_norm": 1.0686557292938232, + "learning_rate": 9.682734886207863e-06, + "loss": 0.586, + "step": 1098 + }, + { + "epoch": 0.14, + "grad_norm": 1.1529673337936401, + "learning_rate": 9.682007098255724e-06, + "loss": 0.5873, + "step": 1099 + }, + { + "epoch": 0.14, + "grad_norm": 1.140615463256836, + "learning_rate": 9.681278503923026e-06, + "loss": 0.5277, + "step": 1100 + }, + { + "epoch": 0.14, + "grad_norm": 1.3383703231811523, + "learning_rate": 9.680549103335259e-06, + "loss": 0.6375, + "step": 1101 + }, + { + "epoch": 0.14, + "grad_norm": 1.2095407247543335, + "learning_rate": 9.679818896618044e-06, + "loss": 0.7485, + "step": 1102 + }, + { + "epoch": 0.14, + "grad_norm": 1.5559945106506348, + "learning_rate": 9.679087883897142e-06, + "loss": 0.5913, + "step": 1103 + }, + { + "epoch": 0.14, + "grad_norm": 1.2311608791351318, + "learning_rate": 9.678356065298463e-06, + "loss": 0.6464, + "step": 1104 + }, + { + "epoch": 0.14, + "grad_norm": 1.1227388381958008, + "learning_rate": 9.677623440948039e-06, + "loss": 0.6501, + "step": 1105 + }, + { + "epoch": 0.14, + "grad_norm": 1.120994210243225, + "learning_rate": 9.676890010972055e-06, + "loss": 0.5631, + "step": 1106 + }, + { + "epoch": 0.14, + "grad_norm": 1.5825780630111694, + "learning_rate": 9.676155775496828e-06, + "loss": 0.6043, + "step": 1107 + }, + { + "epoch": 0.14, + "grad_norm": 1.1823532581329346, + "learning_rate": 9.675420734648814e-06, + "loss": 0.5822, + "step": 1108 + }, + { + "epoch": 0.14, + "grad_norm": 1.5761380195617676, + "learning_rate": 9.674684888554611e-06, + "loss": 0.6976, + "step": 1109 + }, + { + "epoch": 0.14, + "grad_norm": 1.2008821964263916, + "learning_rate": 9.673948237340951e-06, + "loss": 0.5651, + "step": 1110 + }, + { + "epoch": 0.14, + "grad_norm": 1.1293318271636963, + "learning_rate": 9.67321078113471e-06, + "loss": 0.6455, + "step": 1111 + }, + { + "epoch": 0.14, + "grad_norm": 1.1750528812408447, + "learning_rate": 9.6724725200629e-06, + "loss": 0.6694, + "step": 1112 + }, + { + "epoch": 0.14, + "grad_norm": 1.3616904020309448, + "learning_rate": 9.671733454252668e-06, + "loss": 0.6136, + "step": 1113 + }, + { + "epoch": 0.14, + "grad_norm": 1.360683560371399, + "learning_rate": 9.670993583831305e-06, + "loss": 0.6378, + "step": 1114 + }, + { + "epoch": 0.14, + "grad_norm": 1.126110553741455, + "learning_rate": 9.67025290892624e-06, + "loss": 0.584, + "step": 1115 + }, + { + "epoch": 0.14, + "grad_norm": 1.2391899824142456, + "learning_rate": 9.669511429665035e-06, + "loss": 0.6807, + "step": 1116 + }, + { + "epoch": 0.14, + "grad_norm": 1.075810432434082, + "learning_rate": 9.668769146175401e-06, + "loss": 0.6883, + "step": 1117 + }, + { + "epoch": 0.14, + "grad_norm": 0.9843566417694092, + "learning_rate": 9.668026058585175e-06, + "loss": 0.6793, + "step": 1118 + }, + { + "epoch": 0.14, + "grad_norm": 1.44784677028656, + "learning_rate": 9.667282167022343e-06, + "loss": 0.6444, + "step": 1119 + }, + { + "epoch": 0.14, + "grad_norm": 1.0862517356872559, + "learning_rate": 9.666537471615024e-06, + "loss": 0.5965, + "step": 1120 + }, + { + "epoch": 0.14, + "grad_norm": 1.401402235031128, + "learning_rate": 9.665791972491476e-06, + "loss": 0.6598, + "step": 1121 + }, + { + "epoch": 0.14, + "grad_norm": 1.1191637516021729, + "learning_rate": 9.665045669780098e-06, + "loss": 0.7505, + "step": 1122 + }, + { + "epoch": 0.14, + "grad_norm": 1.114385962486267, + "learning_rate": 9.664298563609422e-06, + "loss": 0.655, + "step": 1123 + }, + { + "epoch": 0.14, + "grad_norm": 4.621027946472168, + "learning_rate": 9.663550654108124e-06, + "loss": 0.58, + "step": 1124 + }, + { + "epoch": 0.14, + "grad_norm": 1.0535495281219482, + "learning_rate": 9.662801941405016e-06, + "loss": 0.5866, + "step": 1125 + }, + { + "epoch": 0.14, + "grad_norm": 1.036341905593872, + "learning_rate": 9.662052425629049e-06, + "loss": 0.5704, + "step": 1126 + }, + { + "epoch": 0.14, + "grad_norm": 1.1747620105743408, + "learning_rate": 9.661302106909311e-06, + "loss": 0.5937, + "step": 1127 + }, + { + "epoch": 0.14, + "grad_norm": 1.3271484375, + "learning_rate": 9.66055098537503e-06, + "loss": 0.6407, + "step": 1128 + }, + { + "epoch": 0.14, + "grad_norm": 1.1150386333465576, + "learning_rate": 9.659799061155573e-06, + "loss": 0.6461, + "step": 1129 + }, + { + "epoch": 0.14, + "grad_norm": 1.1988625526428223, + "learning_rate": 9.659046334380438e-06, + "loss": 0.6438, + "step": 1130 + }, + { + "epoch": 0.14, + "grad_norm": 1.3866018056869507, + "learning_rate": 9.658292805179272e-06, + "loss": 0.6631, + "step": 1131 + }, + { + "epoch": 0.15, + "grad_norm": 1.2857260704040527, + "learning_rate": 9.657538473681855e-06, + "loss": 0.6402, + "step": 1132 + }, + { + "epoch": 0.15, + "grad_norm": 1.7626216411590576, + "learning_rate": 9.656783340018103e-06, + "loss": 0.6901, + "step": 1133 + }, + { + "epoch": 0.15, + "grad_norm": 1.3316880464553833, + "learning_rate": 9.656027404318073e-06, + "loss": 0.655, + "step": 1134 + }, + { + "epoch": 0.15, + "grad_norm": 1.3176456689834595, + "learning_rate": 9.65527066671196e-06, + "loss": 0.6157, + "step": 1135 + }, + { + "epoch": 0.15, + "grad_norm": 1.27821683883667, + "learning_rate": 9.654513127330097e-06, + "loss": 0.752, + "step": 1136 + }, + { + "epoch": 0.15, + "grad_norm": 1.4629007577896118, + "learning_rate": 9.653754786302953e-06, + "loss": 0.6666, + "step": 1137 + }, + { + "epoch": 0.15, + "grad_norm": 1.0769317150115967, + "learning_rate": 9.652995643761139e-06, + "loss": 0.6648, + "step": 1138 + }, + { + "epoch": 0.15, + "grad_norm": 1.0857163667678833, + "learning_rate": 9.652235699835402e-06, + "loss": 0.5751, + "step": 1139 + }, + { + "epoch": 0.15, + "grad_norm": 1.2306089401245117, + "learning_rate": 9.651474954656626e-06, + "loss": 0.6447, + "step": 1140 + }, + { + "epoch": 0.15, + "grad_norm": 1.1074614524841309, + "learning_rate": 9.650713408355832e-06, + "loss": 0.6232, + "step": 1141 + }, + { + "epoch": 0.15, + "grad_norm": 1.069425106048584, + "learning_rate": 9.649951061064184e-06, + "loss": 0.6284, + "step": 1142 + }, + { + "epoch": 0.15, + "grad_norm": 1.4408040046691895, + "learning_rate": 9.649187912912978e-06, + "loss": 0.603, + "step": 1143 + }, + { + "epoch": 0.15, + "grad_norm": 1.965793490409851, + "learning_rate": 9.648423964033653e-06, + "loss": 0.609, + "step": 1144 + }, + { + "epoch": 0.15, + "grad_norm": 1.2444179058074951, + "learning_rate": 9.647659214557784e-06, + "loss": 0.6883, + "step": 1145 + }, + { + "epoch": 0.15, + "grad_norm": 1.3600820302963257, + "learning_rate": 9.646893664617082e-06, + "loss": 0.6466, + "step": 1146 + }, + { + "epoch": 0.15, + "grad_norm": 1.187540888786316, + "learning_rate": 9.646127314343398e-06, + "loss": 0.6567, + "step": 1147 + }, + { + "epoch": 0.15, + "grad_norm": 1.062788486480713, + "learning_rate": 9.64536016386872e-06, + "loss": 0.6549, + "step": 1148 + }, + { + "epoch": 0.15, + "grad_norm": 1.4273498058319092, + "learning_rate": 9.644592213325174e-06, + "loss": 0.573, + "step": 1149 + }, + { + "epoch": 0.15, + "grad_norm": 2.6031153202056885, + "learning_rate": 9.643823462845023e-06, + "loss": 0.6893, + "step": 1150 + }, + { + "epoch": 0.15, + "grad_norm": 1.0803120136260986, + "learning_rate": 9.643053912560672e-06, + "loss": 0.6714, + "step": 1151 + }, + { + "epoch": 0.15, + "grad_norm": 1.4752882719039917, + "learning_rate": 9.642283562604658e-06, + "loss": 0.6817, + "step": 1152 + }, + { + "epoch": 0.15, + "grad_norm": 1.3621220588684082, + "learning_rate": 9.641512413109656e-06, + "loss": 0.6725, + "step": 1153 + }, + { + "epoch": 0.15, + "grad_norm": 1.622391700744629, + "learning_rate": 9.640740464208484e-06, + "loss": 0.6874, + "step": 1154 + }, + { + "epoch": 0.15, + "grad_norm": 1.2322818040847778, + "learning_rate": 9.639967716034095e-06, + "loss": 0.6397, + "step": 1155 + }, + { + "epoch": 0.15, + "grad_norm": 1.1383590698242188, + "learning_rate": 9.639194168719577e-06, + "loss": 0.6025, + "step": 1156 + }, + { + "epoch": 0.15, + "grad_norm": 1.2494959831237793, + "learning_rate": 9.638419822398159e-06, + "loss": 0.6905, + "step": 1157 + }, + { + "epoch": 0.15, + "grad_norm": 1.1969854831695557, + "learning_rate": 9.637644677203205e-06, + "loss": 0.5309, + "step": 1158 + }, + { + "epoch": 0.15, + "grad_norm": 1.3315047025680542, + "learning_rate": 9.636868733268218e-06, + "loss": 0.691, + "step": 1159 + }, + { + "epoch": 0.15, + "grad_norm": 1.0063773393630981, + "learning_rate": 9.636091990726843e-06, + "loss": 0.5789, + "step": 1160 + }, + { + "epoch": 0.15, + "grad_norm": 0.9836358428001404, + "learning_rate": 9.635314449712852e-06, + "loss": 0.6658, + "step": 1161 + }, + { + "epoch": 0.15, + "grad_norm": 1.1989636421203613, + "learning_rate": 9.634536110360164e-06, + "loss": 0.6999, + "step": 1162 + }, + { + "epoch": 0.15, + "grad_norm": 1.0682141780853271, + "learning_rate": 9.633756972802832e-06, + "loss": 0.6808, + "step": 1163 + }, + { + "epoch": 0.15, + "grad_norm": 1.0833097696304321, + "learning_rate": 9.632977037175046e-06, + "loss": 0.6291, + "step": 1164 + }, + { + "epoch": 0.15, + "grad_norm": 1.168710470199585, + "learning_rate": 9.632196303611131e-06, + "loss": 0.721, + "step": 1165 + }, + { + "epoch": 0.15, + "grad_norm": 1.2726129293441772, + "learning_rate": 9.63141477224556e-06, + "loss": 0.6047, + "step": 1166 + }, + { + "epoch": 0.15, + "grad_norm": 1.4029903411865234, + "learning_rate": 9.63063244321293e-06, + "loss": 0.6346, + "step": 1167 + }, + { + "epoch": 0.15, + "grad_norm": 1.2573045492172241, + "learning_rate": 9.629849316647983e-06, + "loss": 0.5604, + "step": 1168 + }, + { + "epoch": 0.15, + "grad_norm": 1.3267803192138672, + "learning_rate": 9.629065392685594e-06, + "loss": 0.7928, + "step": 1169 + }, + { + "epoch": 0.15, + "grad_norm": 1.286224126815796, + "learning_rate": 9.628280671460784e-06, + "loss": 0.6406, + "step": 1170 + }, + { + "epoch": 0.15, + "grad_norm": 1.2955398559570312, + "learning_rate": 9.627495153108698e-06, + "loss": 0.6572, + "step": 1171 + }, + { + "epoch": 0.15, + "grad_norm": 1.24276864528656, + "learning_rate": 9.626708837764634e-06, + "loss": 0.6414, + "step": 1172 + }, + { + "epoch": 0.15, + "grad_norm": 1.2331914901733398, + "learning_rate": 9.62592172556401e-06, + "loss": 0.5933, + "step": 1173 + }, + { + "epoch": 0.15, + "grad_norm": 1.0565133094787598, + "learning_rate": 9.625133816642395e-06, + "loss": 0.7286, + "step": 1174 + }, + { + "epoch": 0.15, + "grad_norm": 1.4214918613433838, + "learning_rate": 9.624345111135492e-06, + "loss": 0.6858, + "step": 1175 + }, + { + "epoch": 0.15, + "grad_norm": 1.2732834815979004, + "learning_rate": 9.623555609179134e-06, + "loss": 0.6623, + "step": 1176 + }, + { + "epoch": 0.15, + "grad_norm": 1.498653769493103, + "learning_rate": 9.6227653109093e-06, + "loss": 0.6133, + "step": 1177 + }, + { + "epoch": 0.15, + "grad_norm": 1.5062004327774048, + "learning_rate": 9.621974216462101e-06, + "loss": 0.6736, + "step": 1178 + }, + { + "epoch": 0.15, + "grad_norm": 1.379712462425232, + "learning_rate": 9.621182325973792e-06, + "loss": 0.6587, + "step": 1179 + }, + { + "epoch": 0.15, + "grad_norm": 1.242719292640686, + "learning_rate": 9.620389639580753e-06, + "loss": 0.7805, + "step": 1180 + }, + { + "epoch": 0.15, + "grad_norm": 1.2989170551300049, + "learning_rate": 9.619596157419514e-06, + "loss": 0.6942, + "step": 1181 + }, + { + "epoch": 0.15, + "grad_norm": 1.712216854095459, + "learning_rate": 9.618801879626734e-06, + "loss": 0.6364, + "step": 1182 + }, + { + "epoch": 0.15, + "grad_norm": 2.460629940032959, + "learning_rate": 9.618006806339209e-06, + "loss": 0.6022, + "step": 1183 + }, + { + "epoch": 0.15, + "grad_norm": 1.011130928993225, + "learning_rate": 9.617210937693876e-06, + "loss": 0.6697, + "step": 1184 + }, + { + "epoch": 0.15, + "grad_norm": 0.9472834467887878, + "learning_rate": 9.616414273827808e-06, + "loss": 0.5716, + "step": 1185 + }, + { + "epoch": 0.15, + "grad_norm": 1.0987147092819214, + "learning_rate": 9.615616814878213e-06, + "loss": 0.6417, + "step": 1186 + }, + { + "epoch": 0.15, + "grad_norm": 1.320693850517273, + "learning_rate": 9.614818560982439e-06, + "loss": 0.6858, + "step": 1187 + }, + { + "epoch": 0.15, + "grad_norm": 1.2989826202392578, + "learning_rate": 9.614019512277966e-06, + "loss": 0.6196, + "step": 1188 + }, + { + "epoch": 0.15, + "grad_norm": 1.199810266494751, + "learning_rate": 9.613219668902417e-06, + "loss": 0.7115, + "step": 1189 + }, + { + "epoch": 0.15, + "grad_norm": 1.0295907258987427, + "learning_rate": 9.612419030993545e-06, + "loss": 0.7165, + "step": 1190 + }, + { + "epoch": 0.15, + "grad_norm": 1.7425963878631592, + "learning_rate": 9.611617598689248e-06, + "loss": 0.5757, + "step": 1191 + }, + { + "epoch": 0.15, + "grad_norm": 0.9693262577056885, + "learning_rate": 9.610815372127553e-06, + "loss": 0.534, + "step": 1192 + }, + { + "epoch": 0.15, + "grad_norm": 1.3094513416290283, + "learning_rate": 9.610012351446629e-06, + "loss": 0.5866, + "step": 1193 + }, + { + "epoch": 0.15, + "grad_norm": 1.1087934970855713, + "learning_rate": 9.60920853678478e-06, + "loss": 0.6526, + "step": 1194 + }, + { + "epoch": 0.15, + "grad_norm": 0.9788431525230408, + "learning_rate": 9.608403928280445e-06, + "loss": 0.5877, + "step": 1195 + }, + { + "epoch": 0.15, + "grad_norm": 1.0494896173477173, + "learning_rate": 9.607598526072203e-06, + "loss": 0.6029, + "step": 1196 + }, + { + "epoch": 0.15, + "grad_norm": 1.3554272651672363, + "learning_rate": 9.606792330298767e-06, + "loss": 0.6737, + "step": 1197 + }, + { + "epoch": 0.15, + "grad_norm": 1.2967965602874756, + "learning_rate": 9.60598534109899e-06, + "loss": 0.7037, + "step": 1198 + }, + { + "epoch": 0.15, + "grad_norm": 1.304180383682251, + "learning_rate": 9.605177558611859e-06, + "loss": 0.6131, + "step": 1199 + }, + { + "epoch": 0.15, + "grad_norm": 1.786306619644165, + "learning_rate": 9.604368982976495e-06, + "loss": 0.6623, + "step": 1200 + }, + { + "epoch": 0.15, + "grad_norm": 1.139390468597412, + "learning_rate": 9.603559614332162e-06, + "loss": 0.5915, + "step": 1201 + }, + { + "epoch": 0.15, + "grad_norm": 1.1185011863708496, + "learning_rate": 9.602749452818256e-06, + "loss": 0.5222, + "step": 1202 + }, + { + "epoch": 0.15, + "grad_norm": 1.157008409500122, + "learning_rate": 9.601938498574312e-06, + "loss": 0.6954, + "step": 1203 + }, + { + "epoch": 0.15, + "grad_norm": 2.68686842918396, + "learning_rate": 9.601126751739997e-06, + "loss": 0.6317, + "step": 1204 + }, + { + "epoch": 0.15, + "grad_norm": 1.0472536087036133, + "learning_rate": 9.600314212455125e-06, + "loss": 0.6819, + "step": 1205 + }, + { + "epoch": 0.15, + "grad_norm": 3.0865917205810547, + "learning_rate": 9.599500880859632e-06, + "loss": 0.572, + "step": 1206 + }, + { + "epoch": 0.15, + "grad_norm": 1.095560073852539, + "learning_rate": 9.5986867570936e-06, + "loss": 0.6667, + "step": 1207 + }, + { + "epoch": 0.15, + "grad_norm": 1.0914700031280518, + "learning_rate": 9.597871841297246e-06, + "loss": 0.6668, + "step": 1208 + }, + { + "epoch": 0.15, + "grad_norm": 1.1000221967697144, + "learning_rate": 9.597056133610923e-06, + "loss": 0.6105, + "step": 1209 + }, + { + "epoch": 0.16, + "grad_norm": 1.5570249557495117, + "learning_rate": 9.596239634175119e-06, + "loss": 0.6424, + "step": 1210 + }, + { + "epoch": 0.16, + "grad_norm": 1.3458069562911987, + "learning_rate": 9.595422343130462e-06, + "loss": 0.6504, + "step": 1211 + }, + { + "epoch": 0.16, + "grad_norm": 1.21104896068573, + "learning_rate": 9.59460426061771e-06, + "loss": 0.6194, + "step": 1212 + }, + { + "epoch": 0.16, + "grad_norm": 1.3381779193878174, + "learning_rate": 9.593785386777763e-06, + "loss": 0.5563, + "step": 1213 + }, + { + "epoch": 0.16, + "grad_norm": 1.3582426309585571, + "learning_rate": 9.592965721751657e-06, + "loss": 0.5941, + "step": 1214 + }, + { + "epoch": 0.16, + "grad_norm": 1.5648891925811768, + "learning_rate": 9.59214526568056e-06, + "loss": 0.5831, + "step": 1215 + }, + { + "epoch": 0.16, + "grad_norm": 1.3309880495071411, + "learning_rate": 9.591324018705779e-06, + "loss": 0.5284, + "step": 1216 + }, + { + "epoch": 0.16, + "grad_norm": 1.4098135232925415, + "learning_rate": 9.590501980968758e-06, + "loss": 0.7147, + "step": 1217 + }, + { + "epoch": 0.16, + "grad_norm": 1.2456073760986328, + "learning_rate": 9.589679152611078e-06, + "loss": 0.5923, + "step": 1218 + }, + { + "epoch": 0.16, + "grad_norm": 1.2749463319778442, + "learning_rate": 9.58885553377445e-06, + "loss": 0.6064, + "step": 1219 + }, + { + "epoch": 0.16, + "grad_norm": 1.382809042930603, + "learning_rate": 9.58803112460073e-06, + "loss": 0.6162, + "step": 1220 + }, + { + "epoch": 0.16, + "grad_norm": 1.5532960891723633, + "learning_rate": 9.587205925231904e-06, + "loss": 0.6222, + "step": 1221 + }, + { + "epoch": 0.16, + "grad_norm": 1.1475423574447632, + "learning_rate": 9.586379935810096e-06, + "loss": 0.5918, + "step": 1222 + }, + { + "epoch": 0.16, + "grad_norm": 1.1791549921035767, + "learning_rate": 9.585553156477565e-06, + "loss": 0.5603, + "step": 1223 + }, + { + "epoch": 0.16, + "grad_norm": 1.2540336847305298, + "learning_rate": 9.584725587376708e-06, + "loss": 0.6537, + "step": 1224 + }, + { + "epoch": 0.16, + "grad_norm": 1.3627375364303589, + "learning_rate": 9.583897228650056e-06, + "loss": 0.6247, + "step": 1225 + }, + { + "epoch": 0.16, + "grad_norm": 1.172321081161499, + "learning_rate": 9.583068080440278e-06, + "loss": 0.7451, + "step": 1226 + }, + { + "epoch": 0.16, + "grad_norm": 1.1737582683563232, + "learning_rate": 9.582238142890179e-06, + "loss": 0.6437, + "step": 1227 + }, + { + "epoch": 0.16, + "grad_norm": 1.0629295110702515, + "learning_rate": 9.581407416142696e-06, + "loss": 0.6622, + "step": 1228 + }, + { + "epoch": 0.16, + "grad_norm": 1.3306161165237427, + "learning_rate": 9.580575900340904e-06, + "loss": 0.7027, + "step": 1229 + }, + { + "epoch": 0.16, + "grad_norm": 1.119742512702942, + "learning_rate": 9.57974359562802e-06, + "loss": 0.6573, + "step": 1230 + }, + { + "epoch": 0.16, + "grad_norm": 1.1853952407836914, + "learning_rate": 9.578910502147387e-06, + "loss": 0.7601, + "step": 1231 + }, + { + "epoch": 0.16, + "grad_norm": 2.302206516265869, + "learning_rate": 9.578076620042492e-06, + "loss": 0.616, + "step": 1232 + }, + { + "epoch": 0.16, + "grad_norm": 1.1335538625717163, + "learning_rate": 9.57724194945695e-06, + "loss": 0.5944, + "step": 1233 + }, + { + "epoch": 0.16, + "grad_norm": 1.0384563207626343, + "learning_rate": 9.576406490534518e-06, + "loss": 0.6307, + "step": 1234 + }, + { + "epoch": 0.16, + "grad_norm": 1.1229547262191772, + "learning_rate": 9.575570243419087e-06, + "loss": 0.585, + "step": 1235 + }, + { + "epoch": 0.16, + "grad_norm": 1.213402271270752, + "learning_rate": 9.574733208254684e-06, + "loss": 0.5698, + "step": 1236 + }, + { + "epoch": 0.16, + "grad_norm": 1.1010196208953857, + "learning_rate": 9.57389538518547e-06, + "loss": 0.593, + "step": 1237 + }, + { + "epoch": 0.16, + "grad_norm": 1.357301115989685, + "learning_rate": 9.573056774355745e-06, + "loss": 0.6061, + "step": 1238 + }, + { + "epoch": 0.16, + "grad_norm": 1.095462679862976, + "learning_rate": 9.57221737590994e-06, + "loss": 0.6389, + "step": 1239 + }, + { + "epoch": 0.16, + "grad_norm": 1.4295891523361206, + "learning_rate": 9.571377189992627e-06, + "loss": 0.6627, + "step": 1240 + }, + { + "epoch": 0.16, + "grad_norm": 1.268510341644287, + "learning_rate": 9.570536216748509e-06, + "loss": 0.6064, + "step": 1241 + }, + { + "epoch": 0.16, + "grad_norm": 1.1977204084396362, + "learning_rate": 9.569694456322428e-06, + "loss": 0.6375, + "step": 1242 + }, + { + "epoch": 0.16, + "grad_norm": 1.0961164236068726, + "learning_rate": 9.56885190885936e-06, + "loss": 0.6977, + "step": 1243 + }, + { + "epoch": 0.16, + "grad_norm": 1.396316409111023, + "learning_rate": 9.568008574504415e-06, + "loss": 0.6978, + "step": 1244 + }, + { + "epoch": 0.16, + "grad_norm": 2.1277964115142822, + "learning_rate": 9.56716445340284e-06, + "loss": 0.5666, + "step": 1245 + }, + { + "epoch": 0.16, + "grad_norm": 1.1821579933166504, + "learning_rate": 9.566319545700021e-06, + "loss": 0.6249, + "step": 1246 + }, + { + "epoch": 0.16, + "grad_norm": 1.187343955039978, + "learning_rate": 9.565473851541473e-06, + "loss": 0.6518, + "step": 1247 + }, + { + "epoch": 0.16, + "grad_norm": 1.382359504699707, + "learning_rate": 9.564627371072853e-06, + "loss": 0.6206, + "step": 1248 + }, + { + "epoch": 0.16, + "grad_norm": 1.0749714374542236, + "learning_rate": 9.563780104439945e-06, + "loss": 0.6691, + "step": 1249 + }, + { + "epoch": 0.16, + "grad_norm": 1.6738566160202026, + "learning_rate": 9.562932051788677e-06, + "loss": 0.6744, + "step": 1250 + }, + { + "epoch": 0.16, + "grad_norm": 1.1276246309280396, + "learning_rate": 9.56208321326511e-06, + "loss": 0.6447, + "step": 1251 + }, + { + "epoch": 0.16, + "grad_norm": 1.1615641117095947, + "learning_rate": 9.561233589015435e-06, + "loss": 0.4971, + "step": 1252 + }, + { + "epoch": 0.16, + "grad_norm": 1.220762848854065, + "learning_rate": 9.560383179185986e-06, + "loss": 0.6402, + "step": 1253 + }, + { + "epoch": 0.16, + "grad_norm": 1.1979705095291138, + "learning_rate": 9.559531983923226e-06, + "loss": 0.6028, + "step": 1254 + }, + { + "epoch": 0.16, + "grad_norm": 1.3428279161453247, + "learning_rate": 9.558680003373762e-06, + "loss": 0.65, + "step": 1255 + }, + { + "epoch": 0.16, + "grad_norm": 1.0979018211364746, + "learning_rate": 9.557827237684324e-06, + "loss": 0.582, + "step": 1256 + }, + { + "epoch": 0.16, + "grad_norm": 1.1736618280410767, + "learning_rate": 9.556973687001782e-06, + "loss": 0.6034, + "step": 1257 + }, + { + "epoch": 0.16, + "grad_norm": 1.021039366722107, + "learning_rate": 9.55611935147315e-06, + "loss": 0.5749, + "step": 1258 + }, + { + "epoch": 0.16, + "grad_norm": 1.1807692050933838, + "learning_rate": 9.555264231245566e-06, + "loss": 0.652, + "step": 1259 + }, + { + "epoch": 0.16, + "grad_norm": 0.9832674264907837, + "learning_rate": 9.554408326466306e-06, + "loss": 0.6777, + "step": 1260 + }, + { + "epoch": 0.16, + "grad_norm": 1.0194718837738037, + "learning_rate": 9.553551637282785e-06, + "loss": 0.6897, + "step": 1261 + }, + { + "epoch": 0.16, + "grad_norm": 1.3178449869155884, + "learning_rate": 9.552694163842548e-06, + "loss": 0.6381, + "step": 1262 + }, + { + "epoch": 0.16, + "grad_norm": 1.1459720134735107, + "learning_rate": 9.55183590629328e-06, + "loss": 0.6509, + "step": 1263 + }, + { + "epoch": 0.16, + "grad_norm": 1.71978759765625, + "learning_rate": 9.550976864782792e-06, + "loss": 0.6781, + "step": 1264 + }, + { + "epoch": 0.16, + "grad_norm": 1.1249512434005737, + "learning_rate": 9.550117039459045e-06, + "loss": 0.5807, + "step": 1265 + }, + { + "epoch": 0.16, + "grad_norm": 1.0362927913665771, + "learning_rate": 9.549256430470123e-06, + "loss": 0.6622, + "step": 1266 + }, + { + "epoch": 0.16, + "grad_norm": 1.005173683166504, + "learning_rate": 9.548395037964247e-06, + "loss": 0.6316, + "step": 1267 + }, + { + "epoch": 0.16, + "grad_norm": 1.2679580450057983, + "learning_rate": 9.547532862089775e-06, + "loss": 0.5937, + "step": 1268 + }, + { + "epoch": 0.16, + "grad_norm": 1.0311354398727417, + "learning_rate": 9.5466699029952e-06, + "loss": 0.5042, + "step": 1269 + }, + { + "epoch": 0.16, + "grad_norm": 0.9762603044509888, + "learning_rate": 9.545806160829147e-06, + "loss": 0.6358, + "step": 1270 + }, + { + "epoch": 0.16, + "grad_norm": 0.9036137461662292, + "learning_rate": 9.544941635740382e-06, + "loss": 0.6065, + "step": 1271 + }, + { + "epoch": 0.16, + "grad_norm": 1.0758832693099976, + "learning_rate": 9.544076327877799e-06, + "loss": 0.5666, + "step": 1272 + }, + { + "epoch": 0.16, + "grad_norm": 1.2641098499298096, + "learning_rate": 9.543210237390429e-06, + "loss": 0.6395, + "step": 1273 + }, + { + "epoch": 0.16, + "grad_norm": 1.3716449737548828, + "learning_rate": 9.54234336442744e-06, + "loss": 0.7927, + "step": 1274 + }, + { + "epoch": 0.16, + "grad_norm": 1.1750115156173706, + "learning_rate": 9.541475709138135e-06, + "loss": 0.5781, + "step": 1275 + }, + { + "epoch": 0.16, + "grad_norm": 1.270869493484497, + "learning_rate": 9.540607271671947e-06, + "loss": 0.7835, + "step": 1276 + }, + { + "epoch": 0.16, + "grad_norm": 1.7524586915969849, + "learning_rate": 9.539738052178446e-06, + "loss": 0.6404, + "step": 1277 + }, + { + "epoch": 0.16, + "grad_norm": 1.035844326019287, + "learning_rate": 9.538868050807342e-06, + "loss": 0.6465, + "step": 1278 + }, + { + "epoch": 0.16, + "grad_norm": 1.166962742805481, + "learning_rate": 9.53799726770847e-06, + "loss": 0.5387, + "step": 1279 + }, + { + "epoch": 0.16, + "grad_norm": 1.160826563835144, + "learning_rate": 9.537125703031809e-06, + "loss": 0.6005, + "step": 1280 + }, + { + "epoch": 0.16, + "grad_norm": 1.0476465225219727, + "learning_rate": 9.536253356927465e-06, + "loss": 0.6716, + "step": 1281 + }, + { + "epoch": 0.16, + "grad_norm": 1.0793278217315674, + "learning_rate": 9.535380229545684e-06, + "loss": 0.7254, + "step": 1282 + }, + { + "epoch": 0.16, + "grad_norm": 1.1872631311416626, + "learning_rate": 9.534506321036842e-06, + "loss": 0.6274, + "step": 1283 + }, + { + "epoch": 0.16, + "grad_norm": 1.5636075735092163, + "learning_rate": 9.533631631551455e-06, + "loss": 0.6149, + "step": 1284 + }, + { + "epoch": 0.16, + "grad_norm": 1.2059067487716675, + "learning_rate": 9.532756161240166e-06, + "loss": 0.6378, + "step": 1285 + }, + { + "epoch": 0.16, + "grad_norm": 1.154067039489746, + "learning_rate": 9.531879910253762e-06, + "loss": 0.7198, + "step": 1286 + }, + { + "epoch": 0.16, + "grad_norm": 1.2851213216781616, + "learning_rate": 9.531002878743158e-06, + "loss": 0.6087, + "step": 1287 + }, + { + "epoch": 0.17, + "grad_norm": 1.1828138828277588, + "learning_rate": 9.530125066859404e-06, + "loss": 0.7095, + "step": 1288 + }, + { + "epoch": 0.17, + "grad_norm": 1.218656063079834, + "learning_rate": 9.529246474753684e-06, + "loss": 0.6411, + "step": 1289 + }, + { + "epoch": 0.17, + "grad_norm": 1.1605652570724487, + "learning_rate": 9.528367102577321e-06, + "loss": 0.6413, + "step": 1290 + }, + { + "epoch": 0.17, + "grad_norm": 1.1710295677185059, + "learning_rate": 9.527486950481765e-06, + "loss": 0.6295, + "step": 1291 + }, + { + "epoch": 0.17, + "grad_norm": 0.972027063369751, + "learning_rate": 9.526606018618608e-06, + "loss": 0.5839, + "step": 1292 + }, + { + "epoch": 0.17, + "grad_norm": 1.12654709815979, + "learning_rate": 9.52572430713957e-06, + "loss": 0.5989, + "step": 1293 + }, + { + "epoch": 0.17, + "grad_norm": 1.2400918006896973, + "learning_rate": 9.52484181619651e-06, + "loss": 0.5958, + "step": 1294 + }, + { + "epoch": 0.17, + "grad_norm": 1.2699034214019775, + "learning_rate": 9.523958545941417e-06, + "loss": 0.6808, + "step": 1295 + }, + { + "epoch": 0.17, + "grad_norm": 1.1967825889587402, + "learning_rate": 9.523074496526418e-06, + "loss": 0.6116, + "step": 1296 + }, + { + "epoch": 0.17, + "grad_norm": 1.2007198333740234, + "learning_rate": 9.522189668103771e-06, + "loss": 0.564, + "step": 1297 + }, + { + "epoch": 0.17, + "grad_norm": 1.3140076398849487, + "learning_rate": 9.521304060825872e-06, + "loss": 0.6417, + "step": 1298 + }, + { + "epoch": 0.17, + "grad_norm": 1.6398093700408936, + "learning_rate": 9.520417674845246e-06, + "loss": 0.6366, + "step": 1299 + }, + { + "epoch": 0.17, + "grad_norm": 1.0856890678405762, + "learning_rate": 9.519530510314558e-06, + "loss": 0.6931, + "step": 1300 + }, + { + "epoch": 0.17, + "grad_norm": 1.219240665435791, + "learning_rate": 9.518642567386603e-06, + "loss": 0.6608, + "step": 1301 + }, + { + "epoch": 0.17, + "grad_norm": 1.2921411991119385, + "learning_rate": 9.517753846214309e-06, + "loss": 0.6584, + "step": 1302 + }, + { + "epoch": 0.17, + "grad_norm": 1.0977832078933716, + "learning_rate": 9.51686434695074e-06, + "loss": 0.6219, + "step": 1303 + }, + { + "epoch": 0.17, + "grad_norm": 1.0890233516693115, + "learning_rate": 9.515974069749098e-06, + "loss": 0.6462, + "step": 1304 + }, + { + "epoch": 0.17, + "grad_norm": 1.3405479192733765, + "learning_rate": 9.515083014762714e-06, + "loss": 0.5702, + "step": 1305 + }, + { + "epoch": 0.17, + "grad_norm": 0.9640637636184692, + "learning_rate": 9.514191182145052e-06, + "loss": 0.6735, + "step": 1306 + }, + { + "epoch": 0.17, + "grad_norm": 1.3195781707763672, + "learning_rate": 9.513298572049714e-06, + "loss": 0.6042, + "step": 1307 + }, + { + "epoch": 0.17, + "grad_norm": 1.054443120956421, + "learning_rate": 9.512405184630432e-06, + "loss": 0.6659, + "step": 1308 + }, + { + "epoch": 0.17, + "grad_norm": 2.662334442138672, + "learning_rate": 9.511511020041076e-06, + "loss": 0.5981, + "step": 1309 + }, + { + "epoch": 0.17, + "grad_norm": 1.3885622024536133, + "learning_rate": 9.510616078435646e-06, + "loss": 0.7479, + "step": 1310 + }, + { + "epoch": 0.17, + "grad_norm": 1.1111118793487549, + "learning_rate": 9.509720359968279e-06, + "loss": 0.6262, + "step": 1311 + }, + { + "epoch": 0.17, + "grad_norm": 1.2182552814483643, + "learning_rate": 9.508823864793242e-06, + "loss": 0.6059, + "step": 1312 + }, + { + "epoch": 0.17, + "grad_norm": 1.1193416118621826, + "learning_rate": 9.507926593064941e-06, + "loss": 0.6478, + "step": 1313 + }, + { + "epoch": 0.17, + "grad_norm": 1.3522484302520752, + "learning_rate": 9.50702854493791e-06, + "loss": 0.6145, + "step": 1314 + }, + { + "epoch": 0.17, + "grad_norm": 1.071323275566101, + "learning_rate": 9.506129720566823e-06, + "loss": 0.6033, + "step": 1315 + }, + { + "epoch": 0.17, + "grad_norm": 1.270236611366272, + "learning_rate": 9.50523012010648e-06, + "loss": 0.5783, + "step": 1316 + }, + { + "epoch": 0.17, + "grad_norm": 1.0845481157302856, + "learning_rate": 9.504329743711822e-06, + "loss": 0.6448, + "step": 1317 + }, + { + "epoch": 0.17, + "grad_norm": 1.229878544807434, + "learning_rate": 9.50342859153792e-06, + "loss": 0.6432, + "step": 1318 + }, + { + "epoch": 0.17, + "grad_norm": 1.2673510313034058, + "learning_rate": 9.50252666373998e-06, + "loss": 0.6371, + "step": 1319 + }, + { + "epoch": 0.17, + "grad_norm": 1.5885368585586548, + "learning_rate": 9.501623960473336e-06, + "loss": 0.669, + "step": 1320 + }, + { + "epoch": 0.17, + "grad_norm": 1.237269401550293, + "learning_rate": 9.500720481893466e-06, + "loss": 0.6404, + "step": 1321 + }, + { + "epoch": 0.17, + "grad_norm": 1.3405085802078247, + "learning_rate": 9.499816228155972e-06, + "loss": 0.625, + "step": 1322 + }, + { + "epoch": 0.17, + "grad_norm": 0.9338662624359131, + "learning_rate": 9.498911199416596e-06, + "loss": 0.5723, + "step": 1323 + }, + { + "epoch": 0.17, + "grad_norm": 1.9581477642059326, + "learning_rate": 9.49800539583121e-06, + "loss": 0.5658, + "step": 1324 + }, + { + "epoch": 0.17, + "grad_norm": 1.2048728466033936, + "learning_rate": 9.497098817555818e-06, + "loss": 0.628, + "step": 1325 + }, + { + "epoch": 0.17, + "grad_norm": 1.456356406211853, + "learning_rate": 9.496191464746564e-06, + "loss": 0.6503, + "step": 1326 + }, + { + "epoch": 0.17, + "grad_norm": 1.1231420040130615, + "learning_rate": 9.495283337559718e-06, + "loss": 0.5696, + "step": 1327 + }, + { + "epoch": 0.17, + "grad_norm": 1.0166943073272705, + "learning_rate": 9.494374436151688e-06, + "loss": 0.63, + "step": 1328 + }, + { + "epoch": 0.17, + "grad_norm": 1.5634657144546509, + "learning_rate": 9.493464760679012e-06, + "loss": 0.6734, + "step": 1329 + }, + { + "epoch": 0.17, + "grad_norm": 1.0731712579727173, + "learning_rate": 9.492554311298363e-06, + "loss": 0.6909, + "step": 1330 + }, + { + "epoch": 0.17, + "grad_norm": 1.0333720445632935, + "learning_rate": 9.49164308816655e-06, + "loss": 0.6646, + "step": 1331 + }, + { + "epoch": 0.17, + "grad_norm": 0.9412813186645508, + "learning_rate": 9.49073109144051e-06, + "loss": 0.6295, + "step": 1332 + }, + { + "epoch": 0.17, + "grad_norm": 1.3674019575119019, + "learning_rate": 9.489818321277318e-06, + "loss": 0.6832, + "step": 1333 + }, + { + "epoch": 0.17, + "grad_norm": 1.2104706764221191, + "learning_rate": 9.488904777834178e-06, + "loss": 0.6606, + "step": 1334 + }, + { + "epoch": 0.17, + "grad_norm": 1.2919623851776123, + "learning_rate": 9.487990461268431e-06, + "loss": 0.6844, + "step": 1335 + }, + { + "epoch": 0.17, + "grad_norm": 1.6904457807540894, + "learning_rate": 9.48707537173755e-06, + "loss": 0.6159, + "step": 1336 + }, + { + "epoch": 0.17, + "grad_norm": 0.9881489872932434, + "learning_rate": 9.486159509399138e-06, + "loss": 0.5704, + "step": 1337 + }, + { + "epoch": 0.17, + "grad_norm": 1.1692382097244263, + "learning_rate": 9.485242874410936e-06, + "loss": 0.6502, + "step": 1338 + }, + { + "epoch": 0.17, + "grad_norm": 1.3247543573379517, + "learning_rate": 9.484325466930814e-06, + "loss": 0.6244, + "step": 1339 + }, + { + "epoch": 0.17, + "grad_norm": 1.2117483615875244, + "learning_rate": 9.483407287116777e-06, + "loss": 0.5659, + "step": 1340 + }, + { + "epoch": 0.17, + "grad_norm": 0.9740604162216187, + "learning_rate": 9.482488335126964e-06, + "loss": 0.6486, + "step": 1341 + }, + { + "epoch": 0.17, + "grad_norm": 1.1840041875839233, + "learning_rate": 9.481568611119647e-06, + "loss": 0.7439, + "step": 1342 + }, + { + "epoch": 0.17, + "grad_norm": 1.2369718551635742, + "learning_rate": 9.480648115253225e-06, + "loss": 0.5795, + "step": 1343 + }, + { + "epoch": 0.17, + "grad_norm": 1.4914673566818237, + "learning_rate": 9.479726847686241e-06, + "loss": 0.5856, + "step": 1344 + }, + { + "epoch": 0.17, + "grad_norm": 1.54400634765625, + "learning_rate": 9.478804808577359e-06, + "loss": 0.6174, + "step": 1345 + }, + { + "epoch": 0.17, + "grad_norm": 1.0934762954711914, + "learning_rate": 9.477881998085386e-06, + "loss": 0.6775, + "step": 1346 + }, + { + "epoch": 0.17, + "grad_norm": 1.1581801176071167, + "learning_rate": 9.476958416369253e-06, + "loss": 0.619, + "step": 1347 + }, + { + "epoch": 0.17, + "grad_norm": 1.2244946956634521, + "learning_rate": 9.47603406358803e-06, + "loss": 0.5794, + "step": 1348 + }, + { + "epoch": 0.17, + "grad_norm": 1.1591740846633911, + "learning_rate": 9.47510893990092e-06, + "loss": 0.6477, + "step": 1349 + }, + { + "epoch": 0.17, + "grad_norm": 1.1715471744537354, + "learning_rate": 9.474183045467255e-06, + "loss": 0.6397, + "step": 1350 + }, + { + "epoch": 0.17, + "grad_norm": 1.2296302318572998, + "learning_rate": 9.473256380446501e-06, + "loss": 0.7369, + "step": 1351 + }, + { + "epoch": 0.17, + "grad_norm": 1.7023544311523438, + "learning_rate": 9.472328944998256e-06, + "loss": 0.53, + "step": 1352 + }, + { + "epoch": 0.17, + "grad_norm": 1.0957367420196533, + "learning_rate": 9.471400739282258e-06, + "loss": 0.5876, + "step": 1353 + }, + { + "epoch": 0.17, + "grad_norm": 1.7862164974212646, + "learning_rate": 9.470471763458364e-06, + "loss": 0.666, + "step": 1354 + }, + { + "epoch": 0.17, + "grad_norm": 1.7836625576019287, + "learning_rate": 9.469542017686574e-06, + "loss": 0.6425, + "step": 1355 + }, + { + "epoch": 0.17, + "grad_norm": 1.832993984222412, + "learning_rate": 9.468611502127021e-06, + "loss": 0.5768, + "step": 1356 + }, + { + "epoch": 0.17, + "grad_norm": 1.1718406677246094, + "learning_rate": 9.467680216939964e-06, + "loss": 0.5895, + "step": 1357 + }, + { + "epoch": 0.17, + "grad_norm": 1.15514075756073, + "learning_rate": 9.466748162285797e-06, + "loss": 0.5945, + "step": 1358 + }, + { + "epoch": 0.17, + "grad_norm": 1.2014521360397339, + "learning_rate": 9.46581533832505e-06, + "loss": 0.6036, + "step": 1359 + }, + { + "epoch": 0.17, + "grad_norm": 1.222567081451416, + "learning_rate": 9.464881745218382e-06, + "loss": 0.6194, + "step": 1360 + }, + { + "epoch": 0.17, + "grad_norm": 1.3445652723312378, + "learning_rate": 9.463947383126586e-06, + "loss": 0.6184, + "step": 1361 + }, + { + "epoch": 0.17, + "grad_norm": 1.1077783107757568, + "learning_rate": 9.463012252210586e-06, + "loss": 0.6802, + "step": 1362 + }, + { + "epoch": 0.17, + "grad_norm": 1.173924446105957, + "learning_rate": 9.46207635263144e-06, + "loss": 0.6404, + "step": 1363 + }, + { + "epoch": 0.17, + "grad_norm": 1.214298129081726, + "learning_rate": 9.461139684550335e-06, + "loss": 0.6512, + "step": 1364 + }, + { + "epoch": 0.17, + "grad_norm": 1.7848788499832153, + "learning_rate": 9.460202248128598e-06, + "loss": 0.5918, + "step": 1365 + }, + { + "epoch": 0.18, + "grad_norm": 1.0078115463256836, + "learning_rate": 9.45926404352768e-06, + "loss": 0.517, + "step": 1366 + }, + { + "epoch": 0.18, + "grad_norm": 1.179234504699707, + "learning_rate": 9.458325070909169e-06, + "loss": 0.7112, + "step": 1367 + }, + { + "epoch": 0.18, + "grad_norm": 1.8004214763641357, + "learning_rate": 9.457385330434782e-06, + "loss": 0.6046, + "step": 1368 + }, + { + "epoch": 0.18, + "grad_norm": 1.1646499633789062, + "learning_rate": 9.456444822266373e-06, + "loss": 0.5833, + "step": 1369 + }, + { + "epoch": 0.18, + "grad_norm": 1.1174657344818115, + "learning_rate": 9.455503546565923e-06, + "loss": 0.6349, + "step": 1370 + }, + { + "epoch": 0.18, + "grad_norm": 1.5000091791152954, + "learning_rate": 9.45456150349555e-06, + "loss": 0.6107, + "step": 1371 + }, + { + "epoch": 0.18, + "grad_norm": 1.1921900510787964, + "learning_rate": 9.453618693217498e-06, + "loss": 0.6262, + "step": 1372 + }, + { + "epoch": 0.18, + "grad_norm": 1.1785988807678223, + "learning_rate": 9.452675115894151e-06, + "loss": 0.6661, + "step": 1373 + }, + { + "epoch": 0.18, + "grad_norm": 1.1983801126480103, + "learning_rate": 9.45173077168802e-06, + "loss": 0.6726, + "step": 1374 + }, + { + "epoch": 0.18, + "grad_norm": 1.2910903692245483, + "learning_rate": 9.450785660761747e-06, + "loss": 0.6717, + "step": 1375 + }, + { + "epoch": 0.18, + "grad_norm": 1.1190389394760132, + "learning_rate": 9.44983978327811e-06, + "loss": 0.6953, + "step": 1376 + }, + { + "epoch": 0.18, + "grad_norm": 1.6207910776138306, + "learning_rate": 9.448893139400016e-06, + "loss": 0.6883, + "step": 1377 + }, + { + "epoch": 0.18, + "grad_norm": 1.0816600322723389, + "learning_rate": 9.447945729290507e-06, + "loss": 0.6447, + "step": 1378 + }, + { + "epoch": 0.18, + "grad_norm": 2.4702744483947754, + "learning_rate": 9.446997553112753e-06, + "loss": 0.5832, + "step": 1379 + }, + { + "epoch": 0.18, + "grad_norm": 0.8984103202819824, + "learning_rate": 9.446048611030061e-06, + "loss": 0.6695, + "step": 1380 + }, + { + "epoch": 0.18, + "grad_norm": 1.11966073513031, + "learning_rate": 9.445098903205863e-06, + "loss": 0.6151, + "step": 1381 + }, + { + "epoch": 0.18, + "grad_norm": 1.413903832435608, + "learning_rate": 9.44414842980373e-06, + "loss": 0.5619, + "step": 1382 + }, + { + "epoch": 0.18, + "grad_norm": 1.174336314201355, + "learning_rate": 9.443197190987359e-06, + "loss": 0.7763, + "step": 1383 + }, + { + "epoch": 0.18, + "grad_norm": 1.1478570699691772, + "learning_rate": 9.442245186920585e-06, + "loss": 0.5953, + "step": 1384 + }, + { + "epoch": 0.18, + "grad_norm": 1.0508737564086914, + "learning_rate": 9.44129241776737e-06, + "loss": 0.6232, + "step": 1385 + }, + { + "epoch": 0.18, + "grad_norm": 1.4047367572784424, + "learning_rate": 9.440338883691807e-06, + "loss": 0.611, + "step": 1386 + }, + { + "epoch": 0.18, + "grad_norm": 1.1691255569458008, + "learning_rate": 9.439384584858125e-06, + "loss": 0.6286, + "step": 1387 + }, + { + "epoch": 0.18, + "grad_norm": 1.2848888635635376, + "learning_rate": 9.43842952143068e-06, + "loss": 0.6801, + "step": 1388 + }, + { + "epoch": 0.18, + "grad_norm": 1.1584075689315796, + "learning_rate": 9.437473693573969e-06, + "loss": 0.6356, + "step": 1389 + }, + { + "epoch": 0.18, + "grad_norm": 1.8564170598983765, + "learning_rate": 9.436517101452607e-06, + "loss": 0.6309, + "step": 1390 + }, + { + "epoch": 0.18, + "grad_norm": 1.3005411624908447, + "learning_rate": 9.435559745231348e-06, + "loss": 0.6847, + "step": 1391 + }, + { + "epoch": 0.18, + "grad_norm": 1.392625331878662, + "learning_rate": 9.434601625075082e-06, + "loss": 0.5639, + "step": 1392 + }, + { + "epoch": 0.18, + "grad_norm": 1.3917368650436401, + "learning_rate": 9.43364274114882e-06, + "loss": 0.7253, + "step": 1393 + }, + { + "epoch": 0.18, + "grad_norm": 1.6492226123809814, + "learning_rate": 9.432683093617716e-06, + "loss": 0.6228, + "step": 1394 + }, + { + "epoch": 0.18, + "grad_norm": 1.330742359161377, + "learning_rate": 9.431722682647044e-06, + "loss": 0.7072, + "step": 1395 + }, + { + "epoch": 0.18, + "grad_norm": 1.6217615604400635, + "learning_rate": 9.43076150840222e-06, + "loss": 0.5747, + "step": 1396 + }, + { + "epoch": 0.18, + "grad_norm": 1.2197303771972656, + "learning_rate": 9.429799571048784e-06, + "loss": 0.6557, + "step": 1397 + }, + { + "epoch": 0.18, + "grad_norm": 1.0048884153366089, + "learning_rate": 9.428836870752411e-06, + "loss": 0.707, + "step": 1398 + }, + { + "epoch": 0.18, + "grad_norm": 1.2903577089309692, + "learning_rate": 9.427873407678907e-06, + "loss": 0.7498, + "step": 1399 + }, + { + "epoch": 0.18, + "grad_norm": 1.230726718902588, + "learning_rate": 9.426909181994208e-06, + "loss": 0.6526, + "step": 1400 + }, + { + "epoch": 0.18, + "grad_norm": 1.1689950227737427, + "learning_rate": 9.425944193864382e-06, + "loss": 0.6607, + "step": 1401 + }, + { + "epoch": 0.18, + "grad_norm": 1.2050648927688599, + "learning_rate": 9.42497844345563e-06, + "loss": 0.7587, + "step": 1402 + }, + { + "epoch": 0.18, + "grad_norm": 1.1159378290176392, + "learning_rate": 9.424011930934283e-06, + "loss": 0.6073, + "step": 1403 + }, + { + "epoch": 0.18, + "grad_norm": 1.159040927886963, + "learning_rate": 9.4230446564668e-06, + "loss": 0.6005, + "step": 1404 + }, + { + "epoch": 0.18, + "grad_norm": 1.3513133525848389, + "learning_rate": 9.422076620219777e-06, + "loss": 0.6888, + "step": 1405 + }, + { + "epoch": 0.18, + "grad_norm": 1.4324787855148315, + "learning_rate": 9.42110782235994e-06, + "loss": 0.6781, + "step": 1406 + }, + { + "epoch": 0.18, + "grad_norm": 1.0883930921554565, + "learning_rate": 9.420138263054143e-06, + "loss": 0.6243, + "step": 1407 + }, + { + "epoch": 0.18, + "grad_norm": 1.2407677173614502, + "learning_rate": 9.419167942469372e-06, + "loss": 0.6807, + "step": 1408 + }, + { + "epoch": 0.18, + "grad_norm": 1.275864839553833, + "learning_rate": 9.418196860772746e-06, + "loss": 0.6301, + "step": 1409 + }, + { + "epoch": 0.18, + "grad_norm": 2.2127833366394043, + "learning_rate": 9.417225018131513e-06, + "loss": 0.6004, + "step": 1410 + }, + { + "epoch": 0.18, + "grad_norm": 1.683578610420227, + "learning_rate": 9.416252414713056e-06, + "loss": 0.5999, + "step": 1411 + }, + { + "epoch": 0.18, + "grad_norm": 1.1180976629257202, + "learning_rate": 9.415279050684882e-06, + "loss": 0.6629, + "step": 1412 + }, + { + "epoch": 0.18, + "grad_norm": 1.122758388519287, + "learning_rate": 9.414304926214637e-06, + "loss": 0.582, + "step": 1413 + }, + { + "epoch": 0.18, + "grad_norm": 1.2862956523895264, + "learning_rate": 9.413330041470092e-06, + "loss": 0.5903, + "step": 1414 + }, + { + "epoch": 0.18, + "grad_norm": 1.222602367401123, + "learning_rate": 9.412354396619151e-06, + "loss": 0.6605, + "step": 1415 + }, + { + "epoch": 0.18, + "grad_norm": 1.1338920593261719, + "learning_rate": 9.411377991829851e-06, + "loss": 0.5989, + "step": 1416 + }, + { + "epoch": 0.18, + "grad_norm": 1.2186603546142578, + "learning_rate": 9.410400827270356e-06, + "loss": 0.669, + "step": 1417 + }, + { + "epoch": 0.18, + "grad_norm": 1.038238763809204, + "learning_rate": 9.409422903108963e-06, + "loss": 0.6425, + "step": 1418 + }, + { + "epoch": 0.18, + "grad_norm": 1.0504441261291504, + "learning_rate": 9.4084442195141e-06, + "loss": 0.5588, + "step": 1419 + }, + { + "epoch": 0.18, + "grad_norm": 1.3752822875976562, + "learning_rate": 9.407464776654326e-06, + "loss": 0.5757, + "step": 1420 + }, + { + "epoch": 0.18, + "grad_norm": 1.1799776554107666, + "learning_rate": 9.406484574698328e-06, + "loss": 0.7499, + "step": 1421 + }, + { + "epoch": 0.18, + "grad_norm": 1.192615270614624, + "learning_rate": 9.405503613814927e-06, + "loss": 0.4951, + "step": 1422 + }, + { + "epoch": 0.18, + "grad_norm": 1.10984206199646, + "learning_rate": 9.404521894173075e-06, + "loss": 0.5502, + "step": 1423 + }, + { + "epoch": 0.18, + "grad_norm": 1.2776201963424683, + "learning_rate": 9.403539415941852e-06, + "loss": 0.7015, + "step": 1424 + }, + { + "epoch": 0.18, + "grad_norm": 1.2026793956756592, + "learning_rate": 9.402556179290468e-06, + "loss": 0.6019, + "step": 1425 + }, + { + "epoch": 0.18, + "grad_norm": 1.2137603759765625, + "learning_rate": 9.401572184388268e-06, + "loss": 0.6522, + "step": 1426 + }, + { + "epoch": 0.18, + "grad_norm": 1.404771327972412, + "learning_rate": 9.400587431404726e-06, + "loss": 0.6633, + "step": 1427 + }, + { + "epoch": 0.18, + "grad_norm": 1.0881354808807373, + "learning_rate": 9.399601920509442e-06, + "loss": 0.6346, + "step": 1428 + }, + { + "epoch": 0.18, + "grad_norm": 1.1489696502685547, + "learning_rate": 9.398615651872154e-06, + "loss": 0.5722, + "step": 1429 + }, + { + "epoch": 0.18, + "grad_norm": 1.202345609664917, + "learning_rate": 9.397628625662724e-06, + "loss": 0.6491, + "step": 1430 + }, + { + "epoch": 0.18, + "grad_norm": 1.0614575147628784, + "learning_rate": 9.39664084205115e-06, + "loss": 0.6658, + "step": 1431 + }, + { + "epoch": 0.18, + "grad_norm": 1.1777621507644653, + "learning_rate": 9.395652301207556e-06, + "loss": 0.6096, + "step": 1432 + }, + { + "epoch": 0.18, + "grad_norm": 0.9109401106834412, + "learning_rate": 9.394663003302197e-06, + "loss": 0.5908, + "step": 1433 + }, + { + "epoch": 0.18, + "grad_norm": 1.0313465595245361, + "learning_rate": 9.393672948505461e-06, + "loss": 0.6819, + "step": 1434 + }, + { + "epoch": 0.18, + "grad_norm": 1.7536683082580566, + "learning_rate": 9.392682136987865e-06, + "loss": 0.6045, + "step": 1435 + }, + { + "epoch": 0.18, + "grad_norm": 1.0943320989608765, + "learning_rate": 9.391690568920055e-06, + "loss": 0.5611, + "step": 1436 + }, + { + "epoch": 0.18, + "grad_norm": 1.105167031288147, + "learning_rate": 9.390698244472808e-06, + "loss": 0.6414, + "step": 1437 + }, + { + "epoch": 0.18, + "grad_norm": 1.1738401651382446, + "learning_rate": 9.389705163817034e-06, + "loss": 0.7103, + "step": 1438 + }, + { + "epoch": 0.18, + "grad_norm": 1.1961818933486938, + "learning_rate": 9.388711327123769e-06, + "loss": 0.6541, + "step": 1439 + }, + { + "epoch": 0.18, + "grad_norm": 1.2521352767944336, + "learning_rate": 9.38771673456418e-06, + "loss": 0.6377, + "step": 1440 + }, + { + "epoch": 0.18, + "grad_norm": 1.9367631673812866, + "learning_rate": 9.386721386309569e-06, + "loss": 0.6122, + "step": 1441 + }, + { + "epoch": 0.18, + "grad_norm": 1.1017770767211914, + "learning_rate": 9.385725282531364e-06, + "loss": 0.5612, + "step": 1442 + }, + { + "epoch": 0.18, + "grad_norm": 1.280482530593872, + "learning_rate": 9.38472842340112e-06, + "loss": 0.672, + "step": 1443 + }, + { + "epoch": 0.18, + "grad_norm": 1.5452642440795898, + "learning_rate": 9.383730809090528e-06, + "loss": 0.6426, + "step": 1444 + }, + { + "epoch": 0.19, + "grad_norm": 1.0411386489868164, + "learning_rate": 9.382732439771409e-06, + "loss": 0.5851, + "step": 1445 + }, + { + "epoch": 0.19, + "grad_norm": 1.0093711614608765, + "learning_rate": 9.381733315615708e-06, + "loss": 0.662, + "step": 1446 + }, + { + "epoch": 0.19, + "grad_norm": 1.6546435356140137, + "learning_rate": 9.380733436795506e-06, + "loss": 0.5422, + "step": 1447 + }, + { + "epoch": 0.19, + "grad_norm": 1.1968332529067993, + "learning_rate": 9.379732803483011e-06, + "loss": 0.6345, + "step": 1448 + }, + { + "epoch": 0.19, + "grad_norm": 1.473617672920227, + "learning_rate": 9.378731415850561e-06, + "loss": 0.5548, + "step": 1449 + }, + { + "epoch": 0.19, + "grad_norm": 1.1579792499542236, + "learning_rate": 9.377729274070627e-06, + "loss": 0.6089, + "step": 1450 + }, + { + "epoch": 0.19, + "grad_norm": 1.1918591260910034, + "learning_rate": 9.376726378315806e-06, + "loss": 0.5618, + "step": 1451 + }, + { + "epoch": 0.19, + "grad_norm": 1.340204119682312, + "learning_rate": 9.375722728758826e-06, + "loss": 0.5787, + "step": 1452 + }, + { + "epoch": 0.19, + "grad_norm": 1.30586576461792, + "learning_rate": 9.374718325572547e-06, + "loss": 0.683, + "step": 1453 + }, + { + "epoch": 0.19, + "grad_norm": 1.144736409187317, + "learning_rate": 9.373713168929954e-06, + "loss": 0.6424, + "step": 1454 + }, + { + "epoch": 0.19, + "grad_norm": 1.587199091911316, + "learning_rate": 9.372707259004168e-06, + "loss": 0.6204, + "step": 1455 + }, + { + "epoch": 0.19, + "grad_norm": 1.1502057313919067, + "learning_rate": 9.371700595968437e-06, + "loss": 0.6197, + "step": 1456 + }, + { + "epoch": 0.19, + "grad_norm": 1.1308425664901733, + "learning_rate": 9.370693179996133e-06, + "loss": 0.5867, + "step": 1457 + }, + { + "epoch": 0.19, + "grad_norm": 1.8421180248260498, + "learning_rate": 9.369685011260768e-06, + "loss": 0.6537, + "step": 1458 + }, + { + "epoch": 0.19, + "grad_norm": 1.224798560142517, + "learning_rate": 9.368676089935978e-06, + "loss": 0.6643, + "step": 1459 + }, + { + "epoch": 0.19, + "grad_norm": 1.2730331420898438, + "learning_rate": 9.367666416195526e-06, + "loss": 0.6882, + "step": 1460 + }, + { + "epoch": 0.19, + "grad_norm": 1.4292099475860596, + "learning_rate": 9.366655990213311e-06, + "loss": 0.6895, + "step": 1461 + }, + { + "epoch": 0.19, + "grad_norm": 1.8112746477127075, + "learning_rate": 9.365644812163356e-06, + "loss": 0.7695, + "step": 1462 + }, + { + "epoch": 0.19, + "grad_norm": 1.171579360961914, + "learning_rate": 9.36463288221982e-06, + "loss": 0.648, + "step": 1463 + }, + { + "epoch": 0.19, + "grad_norm": 1.1482553482055664, + "learning_rate": 9.363620200556983e-06, + "loss": 0.6616, + "step": 1464 + }, + { + "epoch": 0.19, + "grad_norm": 1.1553841829299927, + "learning_rate": 9.36260676734926e-06, + "loss": 0.5983, + "step": 1465 + }, + { + "epoch": 0.19, + "grad_norm": 1.32172691822052, + "learning_rate": 9.361592582771195e-06, + "loss": 0.5921, + "step": 1466 + }, + { + "epoch": 0.19, + "grad_norm": 0.9891936779022217, + "learning_rate": 9.36057764699746e-06, + "loss": 0.6451, + "step": 1467 + }, + { + "epoch": 0.19, + "grad_norm": 1.3381367921829224, + "learning_rate": 9.359561960202857e-06, + "loss": 0.6476, + "step": 1468 + }, + { + "epoch": 0.19, + "grad_norm": 1.140139102935791, + "learning_rate": 9.35854552256232e-06, + "loss": 0.6169, + "step": 1469 + }, + { + "epoch": 0.19, + "grad_norm": 1.0977221727371216, + "learning_rate": 9.357528334250905e-06, + "loss": 0.5992, + "step": 1470 + }, + { + "epoch": 0.19, + "grad_norm": 1.3049180507659912, + "learning_rate": 9.356510395443804e-06, + "loss": 0.5462, + "step": 1471 + }, + { + "epoch": 0.19, + "grad_norm": 1.2993820905685425, + "learning_rate": 9.35549170631634e-06, + "loss": 0.6289, + "step": 1472 + }, + { + "epoch": 0.19, + "grad_norm": 1.1152647733688354, + "learning_rate": 9.354472267043955e-06, + "loss": 0.6773, + "step": 1473 + }, + { + "epoch": 0.19, + "grad_norm": 1.593841552734375, + "learning_rate": 9.353452077802233e-06, + "loss": 0.6024, + "step": 1474 + }, + { + "epoch": 0.19, + "grad_norm": 1.0295051336288452, + "learning_rate": 9.352431138766875e-06, + "loss": 0.556, + "step": 1475 + }, + { + "epoch": 0.19, + "grad_norm": 1.2775508165359497, + "learning_rate": 9.35140945011372e-06, + "loss": 0.6177, + "step": 1476 + }, + { + "epoch": 0.19, + "grad_norm": 1.1952461004257202, + "learning_rate": 9.350387012018734e-06, + "loss": 0.6117, + "step": 1477 + }, + { + "epoch": 0.19, + "grad_norm": 1.298227071762085, + "learning_rate": 9.34936382465801e-06, + "loss": 0.6345, + "step": 1478 + }, + { + "epoch": 0.19, + "grad_norm": 1.2173748016357422, + "learning_rate": 9.348339888207771e-06, + "loss": 0.6348, + "step": 1479 + }, + { + "epoch": 0.19, + "grad_norm": 1.5317448377609253, + "learning_rate": 9.347315202844371e-06, + "loss": 0.6326, + "step": 1480 + }, + { + "epoch": 0.19, + "grad_norm": 1.0272226333618164, + "learning_rate": 9.346289768744288e-06, + "loss": 0.7068, + "step": 1481 + }, + { + "epoch": 0.19, + "grad_norm": 0.982128381729126, + "learning_rate": 9.345263586084135e-06, + "loss": 0.5463, + "step": 1482 + }, + { + "epoch": 0.19, + "grad_norm": 0.9588432908058167, + "learning_rate": 9.344236655040649e-06, + "loss": 0.6088, + "step": 1483 + }, + { + "epoch": 0.19, + "grad_norm": 1.1015230417251587, + "learning_rate": 9.343208975790699e-06, + "loss": 0.6073, + "step": 1484 + }, + { + "epoch": 0.19, + "grad_norm": 1.1615246534347534, + "learning_rate": 9.342180548511283e-06, + "loss": 0.5965, + "step": 1485 + }, + { + "epoch": 0.19, + "grad_norm": 1.2907776832580566, + "learning_rate": 9.341151373379527e-06, + "loss": 0.6704, + "step": 1486 + }, + { + "epoch": 0.19, + "grad_norm": 1.2365469932556152, + "learning_rate": 9.340121450572681e-06, + "loss": 0.732, + "step": 1487 + }, + { + "epoch": 0.19, + "grad_norm": 1.2405096292495728, + "learning_rate": 9.339090780268133e-06, + "loss": 0.5716, + "step": 1488 + }, + { + "epoch": 0.19, + "grad_norm": 1.2856647968292236, + "learning_rate": 9.338059362643393e-06, + "loss": 0.6443, + "step": 1489 + }, + { + "epoch": 0.19, + "grad_norm": 1.0891637802124023, + "learning_rate": 9.337027197876103e-06, + "loss": 0.6259, + "step": 1490 + }, + { + "epoch": 0.19, + "grad_norm": 1.1378039121627808, + "learning_rate": 9.33599428614403e-06, + "loss": 0.6413, + "step": 1491 + }, + { + "epoch": 0.19, + "grad_norm": 1.0595463514328003, + "learning_rate": 9.334960627625075e-06, + "loss": 0.593, + "step": 1492 + }, + { + "epoch": 0.19, + "grad_norm": 1.307681918144226, + "learning_rate": 9.333926222497263e-06, + "loss": 0.5555, + "step": 1493 + }, + { + "epoch": 0.19, + "grad_norm": 2.9554941654205322, + "learning_rate": 9.332891070938749e-06, + "loss": 0.6792, + "step": 1494 + }, + { + "epoch": 0.19, + "grad_norm": 1.0907893180847168, + "learning_rate": 9.331855173127817e-06, + "loss": 0.6127, + "step": 1495 + }, + { + "epoch": 0.19, + "grad_norm": 1.8585306406021118, + "learning_rate": 9.33081852924288e-06, + "loss": 0.5657, + "step": 1496 + }, + { + "epoch": 0.19, + "grad_norm": 1.3523632287979126, + "learning_rate": 9.329781139462479e-06, + "loss": 0.6613, + "step": 1497 + }, + { + "epoch": 0.19, + "grad_norm": 1.0348542928695679, + "learning_rate": 9.328743003965283e-06, + "loss": 0.5763, + "step": 1498 + }, + { + "epoch": 0.19, + "grad_norm": 1.102908968925476, + "learning_rate": 9.32770412293009e-06, + "loss": 0.6235, + "step": 1499 + }, + { + "epoch": 0.19, + "grad_norm": 4.521458148956299, + "learning_rate": 9.326664496535825e-06, + "loss": 0.7002, + "step": 1500 + }, + { + "epoch": 0.19, + "grad_norm": 1.6622076034545898, + "learning_rate": 9.325624124961542e-06, + "loss": 0.587, + "step": 1501 + }, + { + "epoch": 0.19, + "grad_norm": 1.0094268321990967, + "learning_rate": 9.324583008386425e-06, + "loss": 0.6609, + "step": 1502 + }, + { + "epoch": 0.19, + "grad_norm": 1.205629825592041, + "learning_rate": 9.323541146989788e-06, + "loss": 0.5841, + "step": 1503 + }, + { + "epoch": 0.19, + "grad_norm": 1.3504383563995361, + "learning_rate": 9.322498540951067e-06, + "loss": 0.7027, + "step": 1504 + }, + { + "epoch": 0.19, + "grad_norm": 1.421724557876587, + "learning_rate": 9.321455190449828e-06, + "loss": 0.6843, + "step": 1505 + }, + { + "epoch": 0.19, + "grad_norm": 1.1611723899841309, + "learning_rate": 9.32041109566577e-06, + "loss": 0.6121, + "step": 1506 + }, + { + "epoch": 0.19, + "grad_norm": 1.239842176437378, + "learning_rate": 9.319366256778717e-06, + "loss": 0.7677, + "step": 1507 + }, + { + "epoch": 0.19, + "grad_norm": 1.2777602672576904, + "learning_rate": 9.318320673968622e-06, + "loss": 0.7695, + "step": 1508 + }, + { + "epoch": 0.19, + "grad_norm": 1.2827951908111572, + "learning_rate": 9.317274347415561e-06, + "loss": 0.6129, + "step": 1509 + }, + { + "epoch": 0.19, + "grad_norm": 1.05486261844635, + "learning_rate": 9.316227277299748e-06, + "loss": 0.6093, + "step": 1510 + }, + { + "epoch": 0.19, + "grad_norm": 1.1314477920532227, + "learning_rate": 9.315179463801518e-06, + "loss": 0.6235, + "step": 1511 + }, + { + "epoch": 0.19, + "grad_norm": 1.0239319801330566, + "learning_rate": 9.314130907101332e-06, + "loss": 0.5927, + "step": 1512 + }, + { + "epoch": 0.19, + "grad_norm": 1.2008589506149292, + "learning_rate": 9.313081607379786e-06, + "loss": 0.6737, + "step": 1513 + }, + { + "epoch": 0.19, + "grad_norm": 1.1876063346862793, + "learning_rate": 9.3120315648176e-06, + "loss": 0.5764, + "step": 1514 + }, + { + "epoch": 0.19, + "grad_norm": 1.021418571472168, + "learning_rate": 9.310980779595623e-06, + "loss": 0.6656, + "step": 1515 + }, + { + "epoch": 0.19, + "grad_norm": 0.923373818397522, + "learning_rate": 9.309929251894828e-06, + "loss": 0.6674, + "step": 1516 + }, + { + "epoch": 0.19, + "grad_norm": 1.0926281213760376, + "learning_rate": 9.308876981896326e-06, + "loss": 0.6407, + "step": 1517 + }, + { + "epoch": 0.19, + "grad_norm": 1.034195065498352, + "learning_rate": 9.307823969781342e-06, + "loss": 0.6774, + "step": 1518 + }, + { + "epoch": 0.19, + "grad_norm": 1.0667943954467773, + "learning_rate": 9.30677021573124e-06, + "loss": 0.7005, + "step": 1519 + }, + { + "epoch": 0.19, + "grad_norm": 1.644808292388916, + "learning_rate": 9.305715719927507e-06, + "loss": 0.6978, + "step": 1520 + }, + { + "epoch": 0.19, + "grad_norm": 1.2867639064788818, + "learning_rate": 9.30466048255176e-06, + "loss": 0.6115, + "step": 1521 + }, + { + "epoch": 0.19, + "grad_norm": 1.1789501905441284, + "learning_rate": 9.303604503785737e-06, + "loss": 0.7755, + "step": 1522 + }, + { + "epoch": 0.2, + "grad_norm": 1.2231398820877075, + "learning_rate": 9.302547783811312e-06, + "loss": 0.741, + "step": 1523 + }, + { + "epoch": 0.2, + "grad_norm": 1.3856559991836548, + "learning_rate": 9.301490322810487e-06, + "loss": 0.632, + "step": 1524 + }, + { + "epoch": 0.2, + "grad_norm": 1.1552003622055054, + "learning_rate": 9.300432120965384e-06, + "loss": 0.6781, + "step": 1525 + }, + { + "epoch": 0.2, + "grad_norm": 1.2242902517318726, + "learning_rate": 9.299373178458255e-06, + "loss": 0.6174, + "step": 1526 + }, + { + "epoch": 0.2, + "grad_norm": 1.118219017982483, + "learning_rate": 9.298313495471486e-06, + "loss": 0.6034, + "step": 1527 + }, + { + "epoch": 0.2, + "grad_norm": 1.7142438888549805, + "learning_rate": 9.297253072187585e-06, + "loss": 0.6345, + "step": 1528 + }, + { + "epoch": 0.2, + "grad_norm": 1.1490740776062012, + "learning_rate": 9.296191908789186e-06, + "loss": 0.6571, + "step": 1529 + }, + { + "epoch": 0.2, + "grad_norm": 1.1473088264465332, + "learning_rate": 9.295130005459055e-06, + "loss": 0.5983, + "step": 1530 + }, + { + "epoch": 0.2, + "grad_norm": 1.0597072839736938, + "learning_rate": 9.294067362380081e-06, + "loss": 0.6132, + "step": 1531 + }, + { + "epoch": 0.2, + "grad_norm": 1.013634443283081, + "learning_rate": 9.293003979735284e-06, + "loss": 0.5525, + "step": 1532 + }, + { + "epoch": 0.2, + "grad_norm": 1.376117467880249, + "learning_rate": 9.291939857707812e-06, + "loss": 0.5656, + "step": 1533 + }, + { + "epoch": 0.2, + "grad_norm": 1.3028066158294678, + "learning_rate": 9.290874996480935e-06, + "loss": 0.6226, + "step": 1534 + }, + { + "epoch": 0.2, + "grad_norm": 1.4443479776382446, + "learning_rate": 9.289809396238054e-06, + "loss": 0.5352, + "step": 1535 + }, + { + "epoch": 0.2, + "grad_norm": 1.0546255111694336, + "learning_rate": 9.288743057162698e-06, + "loss": 0.586, + "step": 1536 + }, + { + "epoch": 0.2, + "grad_norm": 1.3574661016464233, + "learning_rate": 9.287675979438526e-06, + "loss": 0.6401, + "step": 1537 + }, + { + "epoch": 0.2, + "grad_norm": 1.0744688510894775, + "learning_rate": 9.286608163249314e-06, + "loss": 0.6846, + "step": 1538 + }, + { + "epoch": 0.2, + "grad_norm": 1.2303810119628906, + "learning_rate": 9.285539608778976e-06, + "loss": 0.6477, + "step": 1539 + }, + { + "epoch": 0.2, + "grad_norm": 1.0581923723220825, + "learning_rate": 9.284470316211545e-06, + "loss": 0.6514, + "step": 1540 + }, + { + "epoch": 0.2, + "grad_norm": 1.0219451189041138, + "learning_rate": 9.283400285731188e-06, + "loss": 0.6836, + "step": 1541 + }, + { + "epoch": 0.2, + "grad_norm": 1.6948091983795166, + "learning_rate": 9.282329517522196e-06, + "loss": 0.6725, + "step": 1542 + }, + { + "epoch": 0.2, + "grad_norm": 1.1440834999084473, + "learning_rate": 9.281258011768985e-06, + "loss": 0.5548, + "step": 1543 + }, + { + "epoch": 0.2, + "grad_norm": 1.2690272331237793, + "learning_rate": 9.280185768656103e-06, + "loss": 0.5611, + "step": 1544 + }, + { + "epoch": 0.2, + "grad_norm": 1.544965386390686, + "learning_rate": 9.279112788368218e-06, + "loss": 0.6296, + "step": 1545 + }, + { + "epoch": 0.2, + "grad_norm": 1.2523674964904785, + "learning_rate": 9.278039071090135e-06, + "loss": 0.671, + "step": 1546 + }, + { + "epoch": 0.2, + "grad_norm": 1.0705721378326416, + "learning_rate": 9.276964617006772e-06, + "loss": 0.653, + "step": 1547 + }, + { + "epoch": 0.2, + "grad_norm": 1.2598583698272705, + "learning_rate": 9.27588942630319e-06, + "loss": 0.6127, + "step": 1548 + }, + { + "epoch": 0.2, + "grad_norm": 1.0954736471176147, + "learning_rate": 9.274813499164563e-06, + "loss": 0.5496, + "step": 1549 + }, + { + "epoch": 0.2, + "grad_norm": 1.3450437784194946, + "learning_rate": 9.273736835776199e-06, + "loss": 0.6587, + "step": 1550 + }, + { + "epoch": 0.2, + "grad_norm": 1.1436505317687988, + "learning_rate": 9.272659436323535e-06, + "loss": 0.5947, + "step": 1551 + }, + { + "epoch": 0.2, + "grad_norm": 1.2184737920761108, + "learning_rate": 9.271581300992125e-06, + "loss": 0.6579, + "step": 1552 + }, + { + "epoch": 0.2, + "grad_norm": 1.0481288433074951, + "learning_rate": 9.27050242996766e-06, + "loss": 0.5941, + "step": 1553 + }, + { + "epoch": 0.2, + "grad_norm": 1.0766609907150269, + "learning_rate": 9.269422823435953e-06, + "loss": 0.7407, + "step": 1554 + }, + { + "epoch": 0.2, + "grad_norm": 1.0673633813858032, + "learning_rate": 9.268342481582944e-06, + "loss": 0.6136, + "step": 1555 + }, + { + "epoch": 0.2, + "grad_norm": 1.2412662506103516, + "learning_rate": 9.267261404594698e-06, + "loss": 0.6064, + "step": 1556 + }, + { + "epoch": 0.2, + "grad_norm": 1.0127815008163452, + "learning_rate": 9.266179592657414e-06, + "loss": 0.5653, + "step": 1557 + }, + { + "epoch": 0.2, + "grad_norm": 1.0416415929794312, + "learning_rate": 9.265097045957405e-06, + "loss": 0.6664, + "step": 1558 + }, + { + "epoch": 0.2, + "grad_norm": 1.0409266948699951, + "learning_rate": 9.264013764681123e-06, + "loss": 0.5291, + "step": 1559 + }, + { + "epoch": 0.2, + "grad_norm": 1.155242919921875, + "learning_rate": 9.26292974901514e-06, + "loss": 0.6171, + "step": 1560 + }, + { + "epoch": 0.2, + "grad_norm": 1.2648565769195557, + "learning_rate": 9.261844999146153e-06, + "loss": 0.618, + "step": 1561 + }, + { + "epoch": 0.2, + "grad_norm": 1.3313794136047363, + "learning_rate": 9.260759515260991e-06, + "loss": 0.6499, + "step": 1562 + }, + { + "epoch": 0.2, + "grad_norm": 0.9920714497566223, + "learning_rate": 9.259673297546606e-06, + "loss": 0.6458, + "step": 1563 + }, + { + "epoch": 0.2, + "grad_norm": 0.902812123298645, + "learning_rate": 9.258586346190077e-06, + "loss": 0.5976, + "step": 1564 + }, + { + "epoch": 0.2, + "grad_norm": 1.122768521308899, + "learning_rate": 9.257498661378608e-06, + "loss": 0.6489, + "step": 1565 + }, + { + "epoch": 0.2, + "grad_norm": 1.1595557928085327, + "learning_rate": 9.256410243299532e-06, + "loss": 0.6044, + "step": 1566 + }, + { + "epoch": 0.2, + "grad_norm": 1.0857417583465576, + "learning_rate": 9.255321092140305e-06, + "loss": 0.6606, + "step": 1567 + }, + { + "epoch": 0.2, + "grad_norm": 1.2797273397445679, + "learning_rate": 9.254231208088514e-06, + "loss": 0.5624, + "step": 1568 + }, + { + "epoch": 0.2, + "grad_norm": 1.0136442184448242, + "learning_rate": 9.253140591331868e-06, + "loss": 0.6768, + "step": 1569 + }, + { + "epoch": 0.2, + "grad_norm": 1.0762922763824463, + "learning_rate": 9.252049242058202e-06, + "loss": 0.693, + "step": 1570 + }, + { + "epoch": 0.2, + "grad_norm": 1.4632151126861572, + "learning_rate": 9.250957160455483e-06, + "loss": 0.658, + "step": 1571 + }, + { + "epoch": 0.2, + "grad_norm": 1.5439205169677734, + "learning_rate": 9.249864346711794e-06, + "loss": 0.602, + "step": 1572 + }, + { + "epoch": 0.2, + "grad_norm": 3.6240251064300537, + "learning_rate": 9.248770801015355e-06, + "loss": 0.6716, + "step": 1573 + }, + { + "epoch": 0.2, + "grad_norm": 1.2808942794799805, + "learning_rate": 9.247676523554503e-06, + "loss": 0.6153, + "step": 1574 + }, + { + "epoch": 0.2, + "grad_norm": 1.0833419561386108, + "learning_rate": 9.24658151451771e-06, + "loss": 0.5805, + "step": 1575 + }, + { + "epoch": 0.2, + "grad_norm": 2.3130383491516113, + "learning_rate": 9.245485774093563e-06, + "loss": 0.6547, + "step": 1576 + }, + { + "epoch": 0.2, + "grad_norm": 1.1251847743988037, + "learning_rate": 9.244389302470785e-06, + "loss": 0.7021, + "step": 1577 + }, + { + "epoch": 0.2, + "grad_norm": 1.2331297397613525, + "learning_rate": 9.243292099838222e-06, + "loss": 0.7117, + "step": 1578 + }, + { + "epoch": 0.2, + "grad_norm": 1.0849312543869019, + "learning_rate": 9.24219416638484e-06, + "loss": 0.6178, + "step": 1579 + }, + { + "epoch": 0.2, + "grad_norm": 1.2171435356140137, + "learning_rate": 9.24109550229974e-06, + "loss": 0.568, + "step": 1580 + }, + { + "epoch": 0.2, + "grad_norm": 0.9565621614456177, + "learning_rate": 9.239996107772144e-06, + "loss": 0.6697, + "step": 1581 + }, + { + "epoch": 0.2, + "grad_norm": 1.4817333221435547, + "learning_rate": 9.238895982991398e-06, + "loss": 0.6317, + "step": 1582 + }, + { + "epoch": 0.2, + "grad_norm": 1.3165608644485474, + "learning_rate": 9.23779512814698e-06, + "loss": 0.6251, + "step": 1583 + }, + { + "epoch": 0.2, + "grad_norm": 1.244059681892395, + "learning_rate": 9.236693543428485e-06, + "loss": 0.6704, + "step": 1584 + }, + { + "epoch": 0.2, + "grad_norm": 1.1906737089157104, + "learning_rate": 9.235591229025643e-06, + "loss": 0.6598, + "step": 1585 + }, + { + "epoch": 0.2, + "grad_norm": 1.268912434577942, + "learning_rate": 9.234488185128304e-06, + "loss": 0.5556, + "step": 1586 + }, + { + "epoch": 0.2, + "grad_norm": 1.1951463222503662, + "learning_rate": 9.233384411926442e-06, + "loss": 0.7621, + "step": 1587 + }, + { + "epoch": 0.2, + "grad_norm": 1.4412353038787842, + "learning_rate": 9.232279909610163e-06, + "loss": 0.7506, + "step": 1588 + }, + { + "epoch": 0.2, + "grad_norm": 1.2029129266738892, + "learning_rate": 9.231174678369695e-06, + "loss": 0.6591, + "step": 1589 + }, + { + "epoch": 0.2, + "grad_norm": 1.1525003910064697, + "learning_rate": 9.23006871839539e-06, + "loss": 0.6395, + "step": 1590 + }, + { + "epoch": 0.2, + "grad_norm": 1.305783748626709, + "learning_rate": 9.228962029877724e-06, + "loss": 0.6779, + "step": 1591 + }, + { + "epoch": 0.2, + "grad_norm": 1.1824591159820557, + "learning_rate": 9.227854613007308e-06, + "loss": 0.7231, + "step": 1592 + }, + { + "epoch": 0.2, + "grad_norm": 1.8407436609268188, + "learning_rate": 9.22674646797487e-06, + "loss": 0.5819, + "step": 1593 + }, + { + "epoch": 0.2, + "grad_norm": 1.2329612970352173, + "learning_rate": 9.225637594971265e-06, + "loss": 0.7076, + "step": 1594 + }, + { + "epoch": 0.2, + "grad_norm": 1.214455246925354, + "learning_rate": 9.224527994187471e-06, + "loss": 0.6108, + "step": 1595 + }, + { + "epoch": 0.2, + "grad_norm": 1.1704137325286865, + "learning_rate": 9.223417665814599e-06, + "loss": 0.6927, + "step": 1596 + }, + { + "epoch": 0.2, + "grad_norm": 1.149852991104126, + "learning_rate": 9.222306610043877e-06, + "loss": 0.602, + "step": 1597 + }, + { + "epoch": 0.2, + "grad_norm": 1.2092525959014893, + "learning_rate": 9.221194827066664e-06, + "loss": 0.5947, + "step": 1598 + }, + { + "epoch": 0.2, + "grad_norm": 1.0760937929153442, + "learning_rate": 9.22008231707444e-06, + "loss": 0.5526, + "step": 1599 + }, + { + "epoch": 0.2, + "grad_norm": 1.1572587490081787, + "learning_rate": 9.218969080258816e-06, + "loss": 0.6291, + "step": 1600 + }, + { + "epoch": 0.21, + "grad_norm": 1.240919828414917, + "learning_rate": 9.217855116811519e-06, + "loss": 0.6204, + "step": 1601 + }, + { + "epoch": 0.21, + "grad_norm": 1.2830955982208252, + "learning_rate": 9.21674042692441e-06, + "loss": 0.6191, + "step": 1602 + }, + { + "epoch": 0.21, + "grad_norm": 1.104110598564148, + "learning_rate": 9.215625010789469e-06, + "loss": 0.6335, + "step": 1603 + }, + { + "epoch": 0.21, + "grad_norm": 1.0325019359588623, + "learning_rate": 9.214508868598807e-06, + "loss": 0.5526, + "step": 1604 + }, + { + "epoch": 0.21, + "grad_norm": 1.1608110666275024, + "learning_rate": 9.213392000544656e-06, + "loss": 0.6592, + "step": 1605 + }, + { + "epoch": 0.21, + "grad_norm": 1.1397209167480469, + "learning_rate": 9.212274406819373e-06, + "loss": 0.6102, + "step": 1606 + }, + { + "epoch": 0.21, + "grad_norm": 2.098311424255371, + "learning_rate": 9.211156087615442e-06, + "loss": 0.6441, + "step": 1607 + }, + { + "epoch": 0.21, + "grad_norm": 1.3748606443405151, + "learning_rate": 9.210037043125469e-06, + "loss": 0.6898, + "step": 1608 + }, + { + "epoch": 0.21, + "grad_norm": 1.2580170631408691, + "learning_rate": 9.208917273542188e-06, + "loss": 0.6937, + "step": 1609 + }, + { + "epoch": 0.21, + "grad_norm": 1.1992050409317017, + "learning_rate": 9.207796779058456e-06, + "loss": 0.7689, + "step": 1610 + }, + { + "epoch": 0.21, + "grad_norm": 1.173262596130371, + "learning_rate": 9.206675559867254e-06, + "loss": 0.5427, + "step": 1611 + }, + { + "epoch": 0.21, + "grad_norm": 1.1421955823898315, + "learning_rate": 9.205553616161692e-06, + "loss": 0.648, + "step": 1612 + }, + { + "epoch": 0.21, + "grad_norm": 1.1522105932235718, + "learning_rate": 9.204430948135e-06, + "loss": 0.6322, + "step": 1613 + }, + { + "epoch": 0.21, + "grad_norm": 1.0081772804260254, + "learning_rate": 9.203307555980536e-06, + "loss": 0.6584, + "step": 1614 + }, + { + "epoch": 0.21, + "grad_norm": 1.1464192867279053, + "learning_rate": 9.20218343989178e-06, + "loss": 0.7091, + "step": 1615 + }, + { + "epoch": 0.21, + "grad_norm": 1.0025413036346436, + "learning_rate": 9.20105860006234e-06, + "loss": 0.6502, + "step": 1616 + }, + { + "epoch": 0.21, + "grad_norm": 1.2751092910766602, + "learning_rate": 9.199933036685946e-06, + "loss": 0.6899, + "step": 1617 + }, + { + "epoch": 0.21, + "grad_norm": 1.9596514701843262, + "learning_rate": 9.198806749956453e-06, + "loss": 0.6055, + "step": 1618 + }, + { + "epoch": 0.21, + "grad_norm": 1.1237578392028809, + "learning_rate": 9.197679740067842e-06, + "loss": 0.7279, + "step": 1619 + }, + { + "epoch": 0.21, + "grad_norm": 1.0528181791305542, + "learning_rate": 9.196552007214215e-06, + "loss": 0.6048, + "step": 1620 + }, + { + "epoch": 0.21, + "grad_norm": 1.0963466167449951, + "learning_rate": 9.195423551589803e-06, + "loss": 0.6251, + "step": 1621 + }, + { + "epoch": 0.21, + "grad_norm": 1.2052900791168213, + "learning_rate": 9.194294373388962e-06, + "loss": 0.7226, + "step": 1622 + }, + { + "epoch": 0.21, + "grad_norm": 1.6359459161758423, + "learning_rate": 9.193164472806165e-06, + "loss": 0.6516, + "step": 1623 + }, + { + "epoch": 0.21, + "grad_norm": 1.1933037042617798, + "learning_rate": 9.192033850036018e-06, + "loss": 0.6134, + "step": 1624 + }, + { + "epoch": 0.21, + "grad_norm": 1.69362211227417, + "learning_rate": 9.190902505273247e-06, + "loss": 0.6487, + "step": 1625 + }, + { + "epoch": 0.21, + "grad_norm": 1.2256443500518799, + "learning_rate": 9.189770438712701e-06, + "loss": 0.5863, + "step": 1626 + }, + { + "epoch": 0.21, + "grad_norm": 1.431551218032837, + "learning_rate": 9.188637650549357e-06, + "loss": 0.6076, + "step": 1627 + }, + { + "epoch": 0.21, + "grad_norm": 1.1389654874801636, + "learning_rate": 9.187504140978316e-06, + "loss": 0.6828, + "step": 1628 + }, + { + "epoch": 0.21, + "grad_norm": 1.3692309856414795, + "learning_rate": 9.1863699101948e-06, + "loss": 0.609, + "step": 1629 + }, + { + "epoch": 0.21, + "grad_norm": 1.4854897260665894, + "learning_rate": 9.18523495839416e-06, + "loss": 0.5828, + "step": 1630 + }, + { + "epoch": 0.21, + "grad_norm": 1.3503080606460571, + "learning_rate": 9.184099285771865e-06, + "loss": 0.6239, + "step": 1631 + }, + { + "epoch": 0.21, + "grad_norm": 1.0891162157058716, + "learning_rate": 9.182962892523515e-06, + "loss": 0.7232, + "step": 1632 + }, + { + "epoch": 0.21, + "grad_norm": 2.0536813735961914, + "learning_rate": 9.181825778844826e-06, + "loss": 0.5698, + "step": 1633 + }, + { + "epoch": 0.21, + "grad_norm": 1.0989372730255127, + "learning_rate": 9.180687944931646e-06, + "loss": 0.7344, + "step": 1634 + }, + { + "epoch": 0.21, + "grad_norm": 1.1523288488388062, + "learning_rate": 9.179549390979946e-06, + "loss": 0.6036, + "step": 1635 + }, + { + "epoch": 0.21, + "grad_norm": 1.1294163465499878, + "learning_rate": 9.178410117185811e-06, + "loss": 0.6317, + "step": 1636 + }, + { + "epoch": 0.21, + "grad_norm": 1.0369744300842285, + "learning_rate": 9.177270123745466e-06, + "loss": 0.6539, + "step": 1637 + }, + { + "epoch": 0.21, + "grad_norm": 1.0784070491790771, + "learning_rate": 9.176129410855248e-06, + "loss": 0.7082, + "step": 1638 + }, + { + "epoch": 0.21, + "grad_norm": 1.0242846012115479, + "learning_rate": 9.17498797871162e-06, + "loss": 0.6233, + "step": 1639 + }, + { + "epoch": 0.21, + "grad_norm": 1.3577667474746704, + "learning_rate": 9.173845827511176e-06, + "loss": 0.7456, + "step": 1640 + }, + { + "epoch": 0.21, + "grad_norm": 1.2746890783309937, + "learning_rate": 9.172702957450622e-06, + "loss": 0.6227, + "step": 1641 + }, + { + "epoch": 0.21, + "grad_norm": 1.0363303422927856, + "learning_rate": 9.171559368726798e-06, + "loss": 0.6176, + "step": 1642 + }, + { + "epoch": 0.21, + "grad_norm": 1.156578540802002, + "learning_rate": 9.170415061536661e-06, + "loss": 0.5688, + "step": 1643 + }, + { + "epoch": 0.21, + "grad_norm": 1.0655452013015747, + "learning_rate": 9.1692700360773e-06, + "loss": 0.6034, + "step": 1644 + }, + { + "epoch": 0.21, + "grad_norm": 1.3639652729034424, + "learning_rate": 9.168124292545917e-06, + "loss": 0.6289, + "step": 1645 + }, + { + "epoch": 0.21, + "grad_norm": 1.4169421195983887, + "learning_rate": 9.166977831139845e-06, + "loss": 0.6277, + "step": 1646 + }, + { + "epoch": 0.21, + "grad_norm": 1.0737788677215576, + "learning_rate": 9.165830652056537e-06, + "loss": 0.5081, + "step": 1647 + }, + { + "epoch": 0.21, + "grad_norm": 1.884833574295044, + "learning_rate": 9.164682755493574e-06, + "loss": 0.5832, + "step": 1648 + }, + { + "epoch": 0.21, + "grad_norm": 1.2018709182739258, + "learning_rate": 9.163534141648658e-06, + "loss": 0.602, + "step": 1649 + }, + { + "epoch": 0.21, + "grad_norm": 1.1860284805297852, + "learning_rate": 9.162384810719612e-06, + "loss": 0.605, + "step": 1650 + }, + { + "epoch": 0.21, + "grad_norm": 1.697682499885559, + "learning_rate": 9.161234762904386e-06, + "loss": 0.5906, + "step": 1651 + }, + { + "epoch": 0.21, + "grad_norm": 1.6724482774734497, + "learning_rate": 9.160083998401053e-06, + "loss": 0.7244, + "step": 1652 + }, + { + "epoch": 0.21, + "grad_norm": 1.3993253707885742, + "learning_rate": 9.158932517407806e-06, + "loss": 0.703, + "step": 1653 + }, + { + "epoch": 0.21, + "grad_norm": 1.4852396249771118, + "learning_rate": 9.15778032012297e-06, + "loss": 0.6691, + "step": 1654 + }, + { + "epoch": 0.21, + "grad_norm": 1.278127670288086, + "learning_rate": 9.15662740674498e-06, + "loss": 0.6629, + "step": 1655 + }, + { + "epoch": 0.21, + "grad_norm": 1.3181184530258179, + "learning_rate": 9.155473777472408e-06, + "loss": 0.6775, + "step": 1656 + }, + { + "epoch": 0.21, + "grad_norm": 1.3667001724243164, + "learning_rate": 9.15431943250394e-06, + "loss": 0.5703, + "step": 1657 + }, + { + "epoch": 0.21, + "grad_norm": 1.3015559911727905, + "learning_rate": 9.15316437203839e-06, + "loss": 0.65, + "step": 1658 + }, + { + "epoch": 0.21, + "grad_norm": 1.3116071224212646, + "learning_rate": 9.152008596274695e-06, + "loss": 0.5931, + "step": 1659 + }, + { + "epoch": 0.21, + "grad_norm": 1.37589430809021, + "learning_rate": 9.15085210541191e-06, + "loss": 0.5934, + "step": 1660 + }, + { + "epoch": 0.21, + "grad_norm": 1.3749538660049438, + "learning_rate": 9.149694899649218e-06, + "loss": 0.6775, + "step": 1661 + }, + { + "epoch": 0.21, + "grad_norm": 1.1061633825302124, + "learning_rate": 9.148536979185927e-06, + "loss": 0.6038, + "step": 1662 + }, + { + "epoch": 0.21, + "grad_norm": 2.6514101028442383, + "learning_rate": 9.147378344221462e-06, + "loss": 0.5828, + "step": 1663 + }, + { + "epoch": 0.21, + "grad_norm": 1.279590129852295, + "learning_rate": 9.146218994955378e-06, + "loss": 0.548, + "step": 1664 + }, + { + "epoch": 0.21, + "grad_norm": 1.0067777633666992, + "learning_rate": 9.145058931587345e-06, + "loss": 0.5791, + "step": 1665 + }, + { + "epoch": 0.21, + "grad_norm": 1.2838937044143677, + "learning_rate": 9.143898154317164e-06, + "loss": 0.5998, + "step": 1666 + }, + { + "epoch": 0.21, + "grad_norm": 1.0655570030212402, + "learning_rate": 9.142736663344754e-06, + "loss": 0.7026, + "step": 1667 + }, + { + "epoch": 0.21, + "grad_norm": 1.2458590269088745, + "learning_rate": 9.141574458870156e-06, + "loss": 0.7518, + "step": 1668 + }, + { + "epoch": 0.21, + "grad_norm": 1.3690664768218994, + "learning_rate": 9.140411541093539e-06, + "loss": 0.5947, + "step": 1669 + }, + { + "epoch": 0.21, + "grad_norm": 1.4112671613693237, + "learning_rate": 9.139247910215192e-06, + "loss": 0.6794, + "step": 1670 + }, + { + "epoch": 0.21, + "grad_norm": 1.0723782777786255, + "learning_rate": 9.138083566435525e-06, + "loss": 0.6019, + "step": 1671 + }, + { + "epoch": 0.21, + "grad_norm": 1.5119447708129883, + "learning_rate": 9.136918509955074e-06, + "loss": 0.6187, + "step": 1672 + }, + { + "epoch": 0.21, + "grad_norm": 1.17743980884552, + "learning_rate": 9.135752740974495e-06, + "loss": 0.5778, + "step": 1673 + }, + { + "epoch": 0.21, + "grad_norm": 1.1854586601257324, + "learning_rate": 9.13458625969457e-06, + "loss": 0.6281, + "step": 1674 + }, + { + "epoch": 0.21, + "grad_norm": 0.9571298360824585, + "learning_rate": 9.133419066316198e-06, + "loss": 0.5768, + "step": 1675 + }, + { + "epoch": 0.21, + "grad_norm": 1.1036064624786377, + "learning_rate": 9.13225116104041e-06, + "loss": 0.5698, + "step": 1676 + }, + { + "epoch": 0.21, + "grad_norm": 1.1489185094833374, + "learning_rate": 9.131082544068346e-06, + "loss": 0.6844, + "step": 1677 + }, + { + "epoch": 0.21, + "grad_norm": 1.2321306467056274, + "learning_rate": 9.129913215601286e-06, + "loss": 0.6463, + "step": 1678 + }, + { + "epoch": 0.22, + "grad_norm": 1.2575052976608276, + "learning_rate": 9.128743175840615e-06, + "loss": 0.5624, + "step": 1679 + }, + { + "epoch": 0.22, + "grad_norm": 1.1586254835128784, + "learning_rate": 9.127572424987853e-06, + "loss": 0.7156, + "step": 1680 + }, + { + "epoch": 0.22, + "grad_norm": 1.7823494672775269, + "learning_rate": 9.126400963244636e-06, + "loss": 0.5968, + "step": 1681 + }, + { + "epoch": 0.22, + "grad_norm": 1.0591415166854858, + "learning_rate": 9.125228790812726e-06, + "loss": 0.7415, + "step": 1682 + }, + { + "epoch": 0.22, + "grad_norm": 1.2111726999282837, + "learning_rate": 9.124055907894004e-06, + "loss": 0.6182, + "step": 1683 + }, + { + "epoch": 0.22, + "grad_norm": 1.14926278591156, + "learning_rate": 9.12288231469048e-06, + "loss": 0.6119, + "step": 1684 + }, + { + "epoch": 0.22, + "grad_norm": 1.0378811359405518, + "learning_rate": 9.121708011404275e-06, + "loss": 0.6497, + "step": 1685 + }, + { + "epoch": 0.22, + "grad_norm": 1.4201372861862183, + "learning_rate": 9.120532998237642e-06, + "loss": 0.7219, + "step": 1686 + }, + { + "epoch": 0.22, + "grad_norm": 1.1894242763519287, + "learning_rate": 9.119357275392954e-06, + "loss": 0.7498, + "step": 1687 + }, + { + "epoch": 0.22, + "grad_norm": 1.3526839017868042, + "learning_rate": 9.118180843072705e-06, + "loss": 0.6255, + "step": 1688 + }, + { + "epoch": 0.22, + "grad_norm": 1.2016130685806274, + "learning_rate": 9.117003701479508e-06, + "loss": 0.6505, + "step": 1689 + }, + { + "epoch": 0.22, + "grad_norm": 1.1583237648010254, + "learning_rate": 9.115825850816106e-06, + "loss": 0.5844, + "step": 1690 + }, + { + "epoch": 0.22, + "grad_norm": 1.1343616247177124, + "learning_rate": 9.114647291285358e-06, + "loss": 0.6809, + "step": 1691 + }, + { + "epoch": 0.22, + "grad_norm": 1.766956090927124, + "learning_rate": 9.113468023090251e-06, + "loss": 0.6212, + "step": 1692 + }, + { + "epoch": 0.22, + "grad_norm": 1.2435486316680908, + "learning_rate": 9.112288046433883e-06, + "loss": 0.6192, + "step": 1693 + }, + { + "epoch": 0.22, + "grad_norm": 1.0251524448394775, + "learning_rate": 9.111107361519485e-06, + "loss": 0.5751, + "step": 1694 + }, + { + "epoch": 0.22, + "grad_norm": 1.0255200862884521, + "learning_rate": 9.109925968550405e-06, + "loss": 0.5642, + "step": 1695 + }, + { + "epoch": 0.22, + "grad_norm": 1.2505881786346436, + "learning_rate": 9.108743867730115e-06, + "loss": 0.6587, + "step": 1696 + }, + { + "epoch": 0.22, + "grad_norm": 1.3478747606277466, + "learning_rate": 9.107561059262207e-06, + "loss": 0.5907, + "step": 1697 + }, + { + "epoch": 0.22, + "grad_norm": 1.4244675636291504, + "learning_rate": 9.106377543350396e-06, + "loss": 0.6059, + "step": 1698 + }, + { + "epoch": 0.22, + "grad_norm": 0.9292632937431335, + "learning_rate": 9.105193320198518e-06, + "loss": 0.5866, + "step": 1699 + }, + { + "epoch": 0.22, + "grad_norm": 0.9981103539466858, + "learning_rate": 9.104008390010532e-06, + "loss": 0.6674, + "step": 1700 + }, + { + "epoch": 0.22, + "grad_norm": 1.207874059677124, + "learning_rate": 9.102822752990517e-06, + "loss": 0.6106, + "step": 1701 + }, + { + "epoch": 0.22, + "grad_norm": 1.0549036264419556, + "learning_rate": 9.101636409342676e-06, + "loss": 0.583, + "step": 1702 + }, + { + "epoch": 0.22, + "grad_norm": 1.4138764142990112, + "learning_rate": 9.100449359271333e-06, + "loss": 0.7072, + "step": 1703 + }, + { + "epoch": 0.22, + "grad_norm": 1.2411020994186401, + "learning_rate": 9.099261602980933e-06, + "loss": 0.6662, + "step": 1704 + }, + { + "epoch": 0.22, + "grad_norm": 1.9189475774765015, + "learning_rate": 9.098073140676043e-06, + "loss": 0.6922, + "step": 1705 + }, + { + "epoch": 0.22, + "grad_norm": 1.334502100944519, + "learning_rate": 9.096883972561347e-06, + "loss": 0.7025, + "step": 1706 + }, + { + "epoch": 0.22, + "grad_norm": 1.3240032196044922, + "learning_rate": 9.095694098841662e-06, + "loss": 0.6391, + "step": 1707 + }, + { + "epoch": 0.22, + "grad_norm": 1.224280595779419, + "learning_rate": 9.094503519721917e-06, + "loss": 0.5683, + "step": 1708 + }, + { + "epoch": 0.22, + "grad_norm": 1.2366915941238403, + "learning_rate": 9.093312235407166e-06, + "loss": 0.6315, + "step": 1709 + }, + { + "epoch": 0.22, + "grad_norm": 1.2290258407592773, + "learning_rate": 9.09212024610258e-06, + "loss": 0.6049, + "step": 1710 + }, + { + "epoch": 0.22, + "grad_norm": 1.0810284614562988, + "learning_rate": 9.090927552013457e-06, + "loss": 0.6721, + "step": 1711 + }, + { + "epoch": 0.22, + "grad_norm": 1.1469855308532715, + "learning_rate": 9.089734153345215e-06, + "loss": 0.6771, + "step": 1712 + }, + { + "epoch": 0.22, + "grad_norm": 1.5117367506027222, + "learning_rate": 9.088540050303392e-06, + "loss": 0.6422, + "step": 1713 + }, + { + "epoch": 0.22, + "grad_norm": 1.0608218908309937, + "learning_rate": 9.087345243093646e-06, + "loss": 0.6211, + "step": 1714 + }, + { + "epoch": 0.22, + "grad_norm": 1.1350992918014526, + "learning_rate": 9.086149731921763e-06, + "loss": 0.6627, + "step": 1715 + }, + { + "epoch": 0.22, + "grad_norm": 1.1392055749893188, + "learning_rate": 9.084953516993642e-06, + "loss": 0.574, + "step": 1716 + }, + { + "epoch": 0.22, + "grad_norm": 1.4839973449707031, + "learning_rate": 9.083756598515307e-06, + "loss": 0.6127, + "step": 1717 + }, + { + "epoch": 0.22, + "grad_norm": 1.2805958986282349, + "learning_rate": 9.082558976692904e-06, + "loss": 0.6159, + "step": 1718 + }, + { + "epoch": 0.22, + "grad_norm": 0.9901695847511292, + "learning_rate": 9.081360651732698e-06, + "loss": 0.5861, + "step": 1719 + }, + { + "epoch": 0.22, + "grad_norm": 1.3135817050933838, + "learning_rate": 9.080161623841077e-06, + "loss": 0.646, + "step": 1720 + }, + { + "epoch": 0.22, + "grad_norm": 1.2557127475738525, + "learning_rate": 9.078961893224548e-06, + "loss": 0.6455, + "step": 1721 + }, + { + "epoch": 0.22, + "grad_norm": 1.4702677726745605, + "learning_rate": 9.07776146008974e-06, + "loss": 0.6288, + "step": 1722 + }, + { + "epoch": 0.22, + "grad_norm": 1.4625836610794067, + "learning_rate": 9.076560324643405e-06, + "loss": 0.7098, + "step": 1723 + }, + { + "epoch": 0.22, + "grad_norm": 1.4593355655670166, + "learning_rate": 9.075358487092413e-06, + "loss": 0.6319, + "step": 1724 + }, + { + "epoch": 0.22, + "grad_norm": 1.3008965253829956, + "learning_rate": 9.074155947643757e-06, + "loss": 0.6806, + "step": 1725 + }, + { + "epoch": 0.22, + "grad_norm": 1.2681013345718384, + "learning_rate": 9.07295270650455e-06, + "loss": 0.575, + "step": 1726 + }, + { + "epoch": 0.22, + "grad_norm": 1.2825214862823486, + "learning_rate": 9.071748763882025e-06, + "loss": 0.5942, + "step": 1727 + }, + { + "epoch": 0.22, + "grad_norm": 1.12113356590271, + "learning_rate": 9.070544119983536e-06, + "loss": 0.6244, + "step": 1728 + }, + { + "epoch": 0.22, + "grad_norm": 1.394400715827942, + "learning_rate": 9.069338775016558e-06, + "loss": 0.7336, + "step": 1729 + }, + { + "epoch": 0.22, + "grad_norm": 1.3107945919036865, + "learning_rate": 9.06813272918869e-06, + "loss": 0.6463, + "step": 1730 + }, + { + "epoch": 0.22, + "grad_norm": 1.9889616966247559, + "learning_rate": 9.066925982707647e-06, + "loss": 0.5611, + "step": 1731 + }, + { + "epoch": 0.22, + "grad_norm": 1.0906484127044678, + "learning_rate": 9.065718535781266e-06, + "loss": 0.6513, + "step": 1732 + }, + { + "epoch": 0.22, + "grad_norm": 1.320393443107605, + "learning_rate": 9.064510388617507e-06, + "loss": 0.6201, + "step": 1733 + }, + { + "epoch": 0.22, + "grad_norm": 1.2902657985687256, + "learning_rate": 9.063301541424447e-06, + "loss": 0.662, + "step": 1734 + }, + { + "epoch": 0.22, + "grad_norm": 1.093662142753601, + "learning_rate": 9.062091994410286e-06, + "loss": 0.6321, + "step": 1735 + }, + { + "epoch": 0.22, + "grad_norm": 1.2275786399841309, + "learning_rate": 9.060881747783347e-06, + "loss": 0.5982, + "step": 1736 + }, + { + "epoch": 0.22, + "grad_norm": 1.4073659181594849, + "learning_rate": 9.059670801752065e-06, + "loss": 0.6775, + "step": 1737 + }, + { + "epoch": 0.22, + "grad_norm": 1.3687350749969482, + "learning_rate": 9.058459156525003e-06, + "loss": 0.7289, + "step": 1738 + }, + { + "epoch": 0.22, + "grad_norm": 1.213643193244934, + "learning_rate": 9.057246812310844e-06, + "loss": 0.6809, + "step": 1739 + }, + { + "epoch": 0.22, + "grad_norm": 1.2109642028808594, + "learning_rate": 9.056033769318387e-06, + "loss": 0.6457, + "step": 1740 + }, + { + "epoch": 0.22, + "grad_norm": 1.1009825468063354, + "learning_rate": 9.054820027756556e-06, + "loss": 0.6372, + "step": 1741 + }, + { + "epoch": 0.22, + "grad_norm": 0.8893618583679199, + "learning_rate": 9.05360558783439e-06, + "loss": 0.6498, + "step": 1742 + }, + { + "epoch": 0.22, + "grad_norm": 1.9988924264907837, + "learning_rate": 9.052390449761057e-06, + "loss": 0.6498, + "step": 1743 + }, + { + "epoch": 0.22, + "grad_norm": 1.142358422279358, + "learning_rate": 9.051174613745836e-06, + "loss": 0.5587, + "step": 1744 + }, + { + "epoch": 0.22, + "grad_norm": 1.34147047996521, + "learning_rate": 9.049958079998132e-06, + "loss": 0.6156, + "step": 1745 + }, + { + "epoch": 0.22, + "grad_norm": 1.456156849861145, + "learning_rate": 9.048740848727467e-06, + "loss": 0.6106, + "step": 1746 + }, + { + "epoch": 0.22, + "grad_norm": 1.0447920560836792, + "learning_rate": 9.047522920143483e-06, + "loss": 0.6259, + "step": 1747 + }, + { + "epoch": 0.22, + "grad_norm": 2.416192054748535, + "learning_rate": 9.046304294455945e-06, + "loss": 0.5878, + "step": 1748 + }, + { + "epoch": 0.22, + "grad_norm": 1.2161362171173096, + "learning_rate": 9.045084971874738e-06, + "loss": 0.6203, + "step": 1749 + }, + { + "epoch": 0.22, + "grad_norm": 1.0151557922363281, + "learning_rate": 9.043864952609863e-06, + "loss": 0.6412, + "step": 1750 + }, + { + "epoch": 0.22, + "grad_norm": 1.4537842273712158, + "learning_rate": 9.042644236871445e-06, + "loss": 0.6007, + "step": 1751 + }, + { + "epoch": 0.22, + "grad_norm": 1.3863743543624878, + "learning_rate": 9.041422824869729e-06, + "loss": 0.6443, + "step": 1752 + }, + { + "epoch": 0.22, + "grad_norm": 1.5908379554748535, + "learning_rate": 9.040200716815073e-06, + "loss": 0.5611, + "step": 1753 + }, + { + "epoch": 0.22, + "grad_norm": 1.6172839403152466, + "learning_rate": 9.038977912917963e-06, + "loss": 0.6122, + "step": 1754 + }, + { + "epoch": 0.22, + "grad_norm": 1.8064736127853394, + "learning_rate": 9.037754413389006e-06, + "loss": 0.6604, + "step": 1755 + }, + { + "epoch": 0.22, + "grad_norm": 1.4816584587097168, + "learning_rate": 9.03653021843892e-06, + "loss": 0.6819, + "step": 1756 + }, + { + "epoch": 0.23, + "grad_norm": 1.1716893911361694, + "learning_rate": 9.035305328278549e-06, + "loss": 0.6599, + "step": 1757 + }, + { + "epoch": 0.23, + "grad_norm": 1.0522955656051636, + "learning_rate": 9.034079743118857e-06, + "loss": 0.6139, + "step": 1758 + }, + { + "epoch": 0.23, + "grad_norm": 1.0910663604736328, + "learning_rate": 9.032853463170925e-06, + "loss": 0.7266, + "step": 1759 + }, + { + "epoch": 0.23, + "grad_norm": 1.1195260286331177, + "learning_rate": 9.031626488645955e-06, + "loss": 0.5692, + "step": 1760 + }, + { + "epoch": 0.23, + "grad_norm": 1.161078929901123, + "learning_rate": 9.030398819755268e-06, + "loss": 0.6371, + "step": 1761 + }, + { + "epoch": 0.23, + "grad_norm": 1.5670857429504395, + "learning_rate": 9.029170456710303e-06, + "loss": 0.5881, + "step": 1762 + }, + { + "epoch": 0.23, + "grad_norm": 2.10579252243042, + "learning_rate": 9.027941399722626e-06, + "loss": 0.6167, + "step": 1763 + }, + { + "epoch": 0.23, + "grad_norm": 1.177972435951233, + "learning_rate": 9.026711649003911e-06, + "loss": 0.6471, + "step": 1764 + }, + { + "epoch": 0.23, + "grad_norm": 1.4338864088058472, + "learning_rate": 9.025481204765963e-06, + "loss": 0.6252, + "step": 1765 + }, + { + "epoch": 0.23, + "grad_norm": 1.052895188331604, + "learning_rate": 9.024250067220697e-06, + "loss": 0.6478, + "step": 1766 + }, + { + "epoch": 0.23, + "grad_norm": 1.422892689704895, + "learning_rate": 9.023018236580154e-06, + "loss": 0.6252, + "step": 1767 + }, + { + "epoch": 0.23, + "grad_norm": 1.3154759407043457, + "learning_rate": 9.02178571305649e-06, + "loss": 0.5794, + "step": 1768 + }, + { + "epoch": 0.23, + "grad_norm": 1.0219439268112183, + "learning_rate": 9.020552496861982e-06, + "loss": 0.6179, + "step": 1769 + }, + { + "epoch": 0.23, + "grad_norm": 4.240836143493652, + "learning_rate": 9.019318588209028e-06, + "loss": 0.6145, + "step": 1770 + }, + { + "epoch": 0.23, + "grad_norm": 1.2236559391021729, + "learning_rate": 9.018083987310143e-06, + "loss": 0.7196, + "step": 1771 + }, + { + "epoch": 0.23, + "grad_norm": 1.407920479774475, + "learning_rate": 9.01684869437796e-06, + "loss": 0.5701, + "step": 1772 + }, + { + "epoch": 0.23, + "grad_norm": 1.0528749227523804, + "learning_rate": 9.015612709625236e-06, + "loss": 0.6588, + "step": 1773 + }, + { + "epoch": 0.23, + "grad_norm": 1.439102053642273, + "learning_rate": 9.014376033264845e-06, + "loss": 0.6773, + "step": 1774 + }, + { + "epoch": 0.23, + "grad_norm": 0.9866570830345154, + "learning_rate": 9.013138665509776e-06, + "loss": 0.6061, + "step": 1775 + }, + { + "epoch": 0.23, + "grad_norm": 1.7695170640945435, + "learning_rate": 9.011900606573142e-06, + "loss": 0.6408, + "step": 1776 + }, + { + "epoch": 0.23, + "grad_norm": 1.3377400636672974, + "learning_rate": 9.010661856668172e-06, + "loss": 0.7959, + "step": 1777 + }, + { + "epoch": 0.23, + "grad_norm": 1.6860862970352173, + "learning_rate": 9.00942241600822e-06, + "loss": 0.55, + "step": 1778 + }, + { + "epoch": 0.23, + "grad_norm": 1.0897938013076782, + "learning_rate": 9.00818228480675e-06, + "loss": 0.6778, + "step": 1779 + }, + { + "epoch": 0.23, + "grad_norm": 1.2090890407562256, + "learning_rate": 9.00694146327735e-06, + "loss": 0.7092, + "step": 1780 + }, + { + "epoch": 0.23, + "grad_norm": 1.0635035037994385, + "learning_rate": 9.005699951633727e-06, + "loss": 0.4973, + "step": 1781 + }, + { + "epoch": 0.23, + "grad_norm": 1.029817819595337, + "learning_rate": 9.004457750089709e-06, + "loss": 0.7106, + "step": 1782 + }, + { + "epoch": 0.23, + "grad_norm": 1.2057615518569946, + "learning_rate": 9.003214858859234e-06, + "loss": 0.7264, + "step": 1783 + }, + { + "epoch": 0.23, + "grad_norm": 1.096470594406128, + "learning_rate": 9.001971278156367e-06, + "loss": 0.6101, + "step": 1784 + }, + { + "epoch": 0.23, + "grad_norm": 1.1284945011138916, + "learning_rate": 9.000727008195293e-06, + "loss": 0.5879, + "step": 1785 + }, + { + "epoch": 0.23, + "grad_norm": 1.3269438743591309, + "learning_rate": 8.999482049190308e-06, + "loss": 0.6274, + "step": 1786 + }, + { + "epoch": 0.23, + "grad_norm": 1.038558006286621, + "learning_rate": 8.998236401355835e-06, + "loss": 0.5697, + "step": 1787 + }, + { + "epoch": 0.23, + "grad_norm": 1.3907923698425293, + "learning_rate": 8.996990064906408e-06, + "loss": 0.7388, + "step": 1788 + }, + { + "epoch": 0.23, + "grad_norm": 1.2428247928619385, + "learning_rate": 8.995743040056683e-06, + "loss": 0.6643, + "step": 1789 + }, + { + "epoch": 0.23, + "grad_norm": 1.2280868291854858, + "learning_rate": 8.994495327021438e-06, + "loss": 0.6722, + "step": 1790 + }, + { + "epoch": 0.23, + "grad_norm": 1.2366783618927002, + "learning_rate": 8.993246926015562e-06, + "loss": 0.6621, + "step": 1791 + }, + { + "epoch": 0.23, + "grad_norm": 1.1328561305999756, + "learning_rate": 8.99199783725407e-06, + "loss": 0.5697, + "step": 1792 + }, + { + "epoch": 0.23, + "grad_norm": 1.3674986362457275, + "learning_rate": 8.990748060952091e-06, + "loss": 0.6256, + "step": 1793 + }, + { + "epoch": 0.23, + "grad_norm": 1.234496831893921, + "learning_rate": 8.989497597324872e-06, + "loss": 0.6446, + "step": 1794 + }, + { + "epoch": 0.23, + "grad_norm": 1.1295185089111328, + "learning_rate": 8.988246446587781e-06, + "loss": 0.6178, + "step": 1795 + }, + { + "epoch": 0.23, + "grad_norm": 1.1279648542404175, + "learning_rate": 8.986994608956305e-06, + "loss": 0.6405, + "step": 1796 + }, + { + "epoch": 0.23, + "grad_norm": 1.143036961555481, + "learning_rate": 8.985742084646048e-06, + "loss": 0.6375, + "step": 1797 + }, + { + "epoch": 0.23, + "grad_norm": 1.3107563257217407, + "learning_rate": 8.984488873872728e-06, + "loss": 0.6425, + "step": 1798 + }, + { + "epoch": 0.23, + "grad_norm": 1.1655327081680298, + "learning_rate": 8.983234976852187e-06, + "loss": 0.7571, + "step": 1799 + }, + { + "epoch": 0.23, + "grad_norm": 1.195489764213562, + "learning_rate": 8.981980393800384e-06, + "loss": 0.6266, + "step": 1800 + }, + { + "epoch": 0.23, + "grad_norm": 1.3523964881896973, + "learning_rate": 8.980725124933396e-06, + "loss": 0.6549, + "step": 1801 + }, + { + "epoch": 0.23, + "grad_norm": 1.1617556810379028, + "learning_rate": 8.979469170467415e-06, + "loss": 0.62, + "step": 1802 + }, + { + "epoch": 0.23, + "grad_norm": 1.08810293674469, + "learning_rate": 8.978212530618756e-06, + "loss": 0.6809, + "step": 1803 + }, + { + "epoch": 0.23, + "grad_norm": 1.0384219884872437, + "learning_rate": 8.976955205603849e-06, + "loss": 0.5748, + "step": 1804 + }, + { + "epoch": 0.23, + "grad_norm": 0.9001789093017578, + "learning_rate": 8.975697195639242e-06, + "loss": 0.5481, + "step": 1805 + }, + { + "epoch": 0.23, + "grad_norm": 1.583070993423462, + "learning_rate": 8.974438500941603e-06, + "loss": 0.6438, + "step": 1806 + }, + { + "epoch": 0.23, + "grad_norm": 1.3476251363754272, + "learning_rate": 8.973179121727713e-06, + "loss": 0.6684, + "step": 1807 + }, + { + "epoch": 0.23, + "grad_norm": 1.0399832725524902, + "learning_rate": 8.97191905821448e-06, + "loss": 0.6654, + "step": 1808 + }, + { + "epoch": 0.23, + "grad_norm": 1.1862304210662842, + "learning_rate": 8.97065831061892e-06, + "loss": 0.6808, + "step": 1809 + }, + { + "epoch": 0.23, + "grad_norm": 1.1737608909606934, + "learning_rate": 8.969396879158173e-06, + "loss": 0.6566, + "step": 1810 + }, + { + "epoch": 0.23, + "grad_norm": 1.5188488960266113, + "learning_rate": 8.968134764049495e-06, + "loss": 0.6548, + "step": 1811 + }, + { + "epoch": 0.23, + "grad_norm": 1.1486880779266357, + "learning_rate": 8.96687196551026e-06, + "loss": 0.6055, + "step": 1812 + }, + { + "epoch": 0.23, + "grad_norm": 1.047976016998291, + "learning_rate": 8.965608483757958e-06, + "loss": 0.5543, + "step": 1813 + }, + { + "epoch": 0.23, + "grad_norm": 1.347322702407837, + "learning_rate": 8.964344319010196e-06, + "loss": 0.6678, + "step": 1814 + }, + { + "epoch": 0.23, + "grad_norm": 2.521556854248047, + "learning_rate": 8.963079471484707e-06, + "loss": 0.6393, + "step": 1815 + }, + { + "epoch": 0.23, + "grad_norm": 2.818633794784546, + "learning_rate": 8.96181394139933e-06, + "loss": 0.6287, + "step": 1816 + }, + { + "epoch": 0.23, + "grad_norm": 1.1293917894363403, + "learning_rate": 8.960547728972028e-06, + "loss": 0.6093, + "step": 1817 + }, + { + "epoch": 0.23, + "grad_norm": 4.344512462615967, + "learning_rate": 8.959280834420882e-06, + "loss": 0.5448, + "step": 1818 + }, + { + "epoch": 0.23, + "grad_norm": 1.3196654319763184, + "learning_rate": 8.958013257964086e-06, + "loss": 0.7376, + "step": 1819 + }, + { + "epoch": 0.23, + "grad_norm": 1.4868218898773193, + "learning_rate": 8.956744999819958e-06, + "loss": 0.6157, + "step": 1820 + }, + { + "epoch": 0.23, + "grad_norm": 1.2535372972488403, + "learning_rate": 8.955476060206928e-06, + "loss": 0.5815, + "step": 1821 + }, + { + "epoch": 0.23, + "grad_norm": 1.2110216617584229, + "learning_rate": 8.954206439343543e-06, + "loss": 0.6472, + "step": 1822 + }, + { + "epoch": 0.23, + "grad_norm": 1.1209574937820435, + "learning_rate": 8.95293613744847e-06, + "loss": 0.6802, + "step": 1823 + }, + { + "epoch": 0.23, + "grad_norm": 1.10364830493927, + "learning_rate": 8.951665154740495e-06, + "loss": 0.6275, + "step": 1824 + }, + { + "epoch": 0.23, + "grad_norm": 1.5042253732681274, + "learning_rate": 8.950393491438518e-06, + "loss": 0.5856, + "step": 1825 + }, + { + "epoch": 0.23, + "grad_norm": 1.1649775505065918, + "learning_rate": 8.949121147761556e-06, + "loss": 0.7074, + "step": 1826 + }, + { + "epoch": 0.23, + "grad_norm": 1.2775204181671143, + "learning_rate": 8.947848123928747e-06, + "loss": 0.6768, + "step": 1827 + }, + { + "epoch": 0.23, + "grad_norm": 1.109620213508606, + "learning_rate": 8.94657442015934e-06, + "loss": 0.6156, + "step": 1828 + }, + { + "epoch": 0.23, + "grad_norm": 1.454298973083496, + "learning_rate": 8.945300036672709e-06, + "loss": 0.5578, + "step": 1829 + }, + { + "epoch": 0.23, + "grad_norm": 1.3531197309494019, + "learning_rate": 8.944024973688334e-06, + "loss": 0.637, + "step": 1830 + }, + { + "epoch": 0.23, + "grad_norm": 1.2573683261871338, + "learning_rate": 8.942749231425824e-06, + "loss": 0.6422, + "step": 1831 + }, + { + "epoch": 0.23, + "grad_norm": 1.0720746517181396, + "learning_rate": 8.941472810104898e-06, + "loss": 0.6234, + "step": 1832 + }, + { + "epoch": 0.23, + "grad_norm": 1.301047444343567, + "learning_rate": 8.940195709945395e-06, + "loss": 0.6037, + "step": 1833 + }, + { + "epoch": 0.23, + "grad_norm": 1.0918940305709839, + "learning_rate": 8.938917931167268e-06, + "loss": 0.6048, + "step": 1834 + }, + { + "epoch": 0.24, + "grad_norm": 1.1322157382965088, + "learning_rate": 8.93763947399059e-06, + "loss": 0.6611, + "step": 1835 + }, + { + "epoch": 0.24, + "grad_norm": 1.2955923080444336, + "learning_rate": 8.936360338635546e-06, + "loss": 0.6222, + "step": 1836 + }, + { + "epoch": 0.24, + "grad_norm": 1.2364767789840698, + "learning_rate": 8.935080525322443e-06, + "loss": 0.6594, + "step": 1837 + }, + { + "epoch": 0.24, + "grad_norm": 1.4350496530532837, + "learning_rate": 8.933800034271706e-06, + "loss": 0.6246, + "step": 1838 + }, + { + "epoch": 0.24, + "grad_norm": 1.5731194019317627, + "learning_rate": 8.932518865703868e-06, + "loss": 0.6258, + "step": 1839 + }, + { + "epoch": 0.24, + "grad_norm": 1.1843388080596924, + "learning_rate": 8.931237019839587e-06, + "loss": 0.5611, + "step": 1840 + }, + { + "epoch": 0.24, + "grad_norm": 1.2916220426559448, + "learning_rate": 8.929954496899636e-06, + "loss": 0.6868, + "step": 1841 + }, + { + "epoch": 0.24, + "grad_norm": 1.1749646663665771, + "learning_rate": 8.928671297104901e-06, + "loss": 0.6049, + "step": 1842 + }, + { + "epoch": 0.24, + "grad_norm": 1.2070494890213013, + "learning_rate": 8.927387420676387e-06, + "loss": 0.6208, + "step": 1843 + }, + { + "epoch": 0.24, + "grad_norm": 1.1378085613250732, + "learning_rate": 8.92610286783522e-06, + "loss": 0.7296, + "step": 1844 + }, + { + "epoch": 0.24, + "grad_norm": 1.2360167503356934, + "learning_rate": 8.924817638802634e-06, + "loss": 0.6041, + "step": 1845 + }, + { + "epoch": 0.24, + "grad_norm": 1.0370523929595947, + "learning_rate": 8.923531733799984e-06, + "loss": 0.6482, + "step": 1846 + }, + { + "epoch": 0.24, + "grad_norm": 1.1742146015167236, + "learning_rate": 8.922245153048742e-06, + "loss": 0.5543, + "step": 1847 + }, + { + "epoch": 0.24, + "grad_norm": 0.9824729561805725, + "learning_rate": 8.920957896770495e-06, + "loss": 0.6877, + "step": 1848 + }, + { + "epoch": 0.24, + "grad_norm": 1.5906468629837036, + "learning_rate": 8.919669965186946e-06, + "loss": 0.6197, + "step": 1849 + }, + { + "epoch": 0.24, + "grad_norm": 1.0606296062469482, + "learning_rate": 8.918381358519916e-06, + "loss": 0.6899, + "step": 1850 + }, + { + "epoch": 0.24, + "grad_norm": 1.3511884212493896, + "learning_rate": 8.917092076991342e-06, + "loss": 0.677, + "step": 1851 + }, + { + "epoch": 0.24, + "grad_norm": 1.168999195098877, + "learning_rate": 8.915802120823274e-06, + "loss": 0.5945, + "step": 1852 + }, + { + "epoch": 0.24, + "grad_norm": 1.0430333614349365, + "learning_rate": 8.914511490237883e-06, + "loss": 0.5796, + "step": 1853 + }, + { + "epoch": 0.24, + "grad_norm": 1.0085581541061401, + "learning_rate": 8.913220185457455e-06, + "loss": 0.5656, + "step": 1854 + }, + { + "epoch": 0.24, + "grad_norm": 1.1767834424972534, + "learning_rate": 8.911928206704388e-06, + "loss": 0.6895, + "step": 1855 + }, + { + "epoch": 0.24, + "grad_norm": 0.9236755967140198, + "learning_rate": 8.910635554201199e-06, + "loss": 0.6282, + "step": 1856 + }, + { + "epoch": 0.24, + "grad_norm": 1.189577341079712, + "learning_rate": 8.909342228170523e-06, + "loss": 0.6037, + "step": 1857 + }, + { + "epoch": 0.24, + "grad_norm": 1.07065749168396, + "learning_rate": 8.90804822883511e-06, + "loss": 0.6259, + "step": 1858 + }, + { + "epoch": 0.24, + "grad_norm": 1.224584698677063, + "learning_rate": 8.906753556417822e-06, + "loss": 0.5854, + "step": 1859 + }, + { + "epoch": 0.24, + "grad_norm": 1.136846899986267, + "learning_rate": 8.905458211141642e-06, + "loss": 0.6162, + "step": 1860 + }, + { + "epoch": 0.24, + "grad_norm": 1.1061546802520752, + "learning_rate": 8.904162193229667e-06, + "loss": 0.6708, + "step": 1861 + }, + { + "epoch": 0.24, + "grad_norm": 1.1807464361190796, + "learning_rate": 8.90286550290511e-06, + "loss": 0.6058, + "step": 1862 + }, + { + "epoch": 0.24, + "grad_norm": 1.3680741786956787, + "learning_rate": 8.901568140391298e-06, + "loss": 0.5691, + "step": 1863 + }, + { + "epoch": 0.24, + "grad_norm": 1.45723557472229, + "learning_rate": 8.900270105911676e-06, + "loss": 0.6373, + "step": 1864 + }, + { + "epoch": 0.24, + "grad_norm": 1.1721493005752563, + "learning_rate": 8.898971399689804e-06, + "loss": 0.6277, + "step": 1865 + }, + { + "epoch": 0.24, + "grad_norm": 1.2810320854187012, + "learning_rate": 8.89767202194936e-06, + "loss": 0.647, + "step": 1866 + }, + { + "epoch": 0.24, + "grad_norm": 1.231530785560608, + "learning_rate": 8.896371972914131e-06, + "loss": 0.6348, + "step": 1867 + }, + { + "epoch": 0.24, + "grad_norm": 1.2142878770828247, + "learning_rate": 8.895071252808025e-06, + "loss": 0.6003, + "step": 1868 + }, + { + "epoch": 0.24, + "grad_norm": 1.8936800956726074, + "learning_rate": 8.893769861855068e-06, + "loss": 0.6248, + "step": 1869 + }, + { + "epoch": 0.24, + "grad_norm": 1.1831871271133423, + "learning_rate": 8.892467800279396e-06, + "loss": 0.6167, + "step": 1870 + }, + { + "epoch": 0.24, + "grad_norm": 1.351455807685852, + "learning_rate": 8.891165068305263e-06, + "loss": 0.6668, + "step": 1871 + }, + { + "epoch": 0.24, + "grad_norm": 1.335632562637329, + "learning_rate": 8.889861666157038e-06, + "loss": 0.6294, + "step": 1872 + }, + { + "epoch": 0.24, + "grad_norm": 1.1500890254974365, + "learning_rate": 8.888557594059204e-06, + "loss": 0.6425, + "step": 1873 + }, + { + "epoch": 0.24, + "grad_norm": 1.1159440279006958, + "learning_rate": 8.887252852236365e-06, + "loss": 0.6158, + "step": 1874 + }, + { + "epoch": 0.24, + "grad_norm": 1.4507861137390137, + "learning_rate": 8.885947440913232e-06, + "loss": 0.6023, + "step": 1875 + }, + { + "epoch": 0.24, + "grad_norm": 1.169786810874939, + "learning_rate": 8.884641360314636e-06, + "loss": 0.6775, + "step": 1876 + }, + { + "epoch": 0.24, + "grad_norm": 1.3057630062103271, + "learning_rate": 8.883334610665527e-06, + "loss": 0.5631, + "step": 1877 + }, + { + "epoch": 0.24, + "grad_norm": 1.4330273866653442, + "learning_rate": 8.88202719219096e-06, + "loss": 0.6352, + "step": 1878 + }, + { + "epoch": 0.24, + "grad_norm": 1.038002610206604, + "learning_rate": 8.880719105116116e-06, + "loss": 0.6006, + "step": 1879 + }, + { + "epoch": 0.24, + "grad_norm": 0.988121747970581, + "learning_rate": 8.879410349666284e-06, + "loss": 0.5431, + "step": 1880 + }, + { + "epoch": 0.24, + "grad_norm": 3.9103357791900635, + "learning_rate": 8.87810092606687e-06, + "loss": 0.6212, + "step": 1881 + }, + { + "epoch": 0.24, + "grad_norm": 1.3641910552978516, + "learning_rate": 8.876790834543398e-06, + "loss": 0.7146, + "step": 1882 + }, + { + "epoch": 0.24, + "grad_norm": 1.3473871946334839, + "learning_rate": 8.875480075321506e-06, + "loss": 0.6231, + "step": 1883 + }, + { + "epoch": 0.24, + "grad_norm": 1.0874309539794922, + "learning_rate": 8.87416864862694e-06, + "loss": 0.6109, + "step": 1884 + }, + { + "epoch": 0.24, + "grad_norm": 1.3928847312927246, + "learning_rate": 8.872856554685569e-06, + "loss": 0.6334, + "step": 1885 + }, + { + "epoch": 0.24, + "grad_norm": 1.6365965604782104, + "learning_rate": 8.871543793723378e-06, + "loss": 0.6463, + "step": 1886 + }, + { + "epoch": 0.24, + "grad_norm": 1.2042670249938965, + "learning_rate": 8.870230365966459e-06, + "loss": 0.5847, + "step": 1887 + }, + { + "epoch": 0.24, + "grad_norm": 1.115470290184021, + "learning_rate": 8.868916271641025e-06, + "loss": 0.6091, + "step": 1888 + }, + { + "epoch": 0.24, + "grad_norm": 1.1585477590560913, + "learning_rate": 8.867601510973402e-06, + "loss": 0.7887, + "step": 1889 + }, + { + "epoch": 0.24, + "grad_norm": 1.2588825225830078, + "learning_rate": 8.86628608419003e-06, + "loss": 0.4943, + "step": 1890 + }, + { + "epoch": 0.24, + "grad_norm": 1.139786720275879, + "learning_rate": 8.864969991517465e-06, + "loss": 0.5754, + "step": 1891 + }, + { + "epoch": 0.24, + "grad_norm": 1.179455280303955, + "learning_rate": 8.86365323318238e-06, + "loss": 0.5672, + "step": 1892 + }, + { + "epoch": 0.24, + "grad_norm": 1.169512152671814, + "learning_rate": 8.862335809411556e-06, + "loss": 0.5903, + "step": 1893 + }, + { + "epoch": 0.24, + "grad_norm": 1.500291347503662, + "learning_rate": 8.861017720431893e-06, + "loss": 0.7036, + "step": 1894 + }, + { + "epoch": 0.24, + "grad_norm": 1.1209545135498047, + "learning_rate": 8.859698966470404e-06, + "loss": 0.6231, + "step": 1895 + }, + { + "epoch": 0.24, + "grad_norm": 1.0172909498214722, + "learning_rate": 8.858379547754222e-06, + "loss": 0.6046, + "step": 1896 + }, + { + "epoch": 0.24, + "grad_norm": 1.06539785861969, + "learning_rate": 8.857059464510586e-06, + "loss": 0.5999, + "step": 1897 + }, + { + "epoch": 0.24, + "grad_norm": 1.3168922662734985, + "learning_rate": 8.855738716966857e-06, + "loss": 0.5925, + "step": 1898 + }, + { + "epoch": 0.24, + "grad_norm": 0.9954319596290588, + "learning_rate": 8.854417305350503e-06, + "loss": 0.5778, + "step": 1899 + }, + { + "epoch": 0.24, + "grad_norm": 1.1088371276855469, + "learning_rate": 8.853095229889112e-06, + "loss": 0.5955, + "step": 1900 + }, + { + "epoch": 0.24, + "grad_norm": 1.4451113939285278, + "learning_rate": 8.851772490810386e-06, + "loss": 0.5305, + "step": 1901 + }, + { + "epoch": 0.24, + "grad_norm": 1.0991573333740234, + "learning_rate": 8.850449088342138e-06, + "loss": 0.7227, + "step": 1902 + }, + { + "epoch": 0.24, + "grad_norm": 1.025597333908081, + "learning_rate": 8.849125022712297e-06, + "loss": 0.5666, + "step": 1903 + }, + { + "epoch": 0.24, + "grad_norm": 1.0423365831375122, + "learning_rate": 8.847800294148908e-06, + "loss": 0.6944, + "step": 1904 + }, + { + "epoch": 0.24, + "grad_norm": 1.0984046459197998, + "learning_rate": 8.846474902880128e-06, + "loss": 0.6633, + "step": 1905 + }, + { + "epoch": 0.24, + "grad_norm": 1.16025710105896, + "learning_rate": 8.845148849134228e-06, + "loss": 0.7596, + "step": 1906 + }, + { + "epoch": 0.24, + "grad_norm": 1.2320754528045654, + "learning_rate": 8.843822133139595e-06, + "loss": 0.5805, + "step": 1907 + }, + { + "epoch": 0.24, + "grad_norm": 1.1268361806869507, + "learning_rate": 8.842494755124728e-06, + "loss": 0.6417, + "step": 1908 + }, + { + "epoch": 0.24, + "grad_norm": 1.4205336570739746, + "learning_rate": 8.84116671531824e-06, + "loss": 0.684, + "step": 1909 + }, + { + "epoch": 0.24, + "grad_norm": 1.1660581827163696, + "learning_rate": 8.839838013948861e-06, + "loss": 0.6393, + "step": 1910 + }, + { + "epoch": 0.24, + "grad_norm": 1.339078426361084, + "learning_rate": 8.838508651245432e-06, + "loss": 0.6015, + "step": 1911 + }, + { + "epoch": 0.24, + "grad_norm": 1.1819093227386475, + "learning_rate": 8.837178627436907e-06, + "loss": 0.5398, + "step": 1912 + }, + { + "epoch": 0.25, + "grad_norm": 1.0426304340362549, + "learning_rate": 8.835847942752357e-06, + "loss": 0.5841, + "step": 1913 + }, + { + "epoch": 0.25, + "grad_norm": 1.4607350826263428, + "learning_rate": 8.834516597420968e-06, + "loss": 0.5586, + "step": 1914 + }, + { + "epoch": 0.25, + "grad_norm": 1.166911244392395, + "learning_rate": 8.833184591672033e-06, + "loss": 0.6171, + "step": 1915 + }, + { + "epoch": 0.25, + "grad_norm": 1.135672688484192, + "learning_rate": 8.831851925734963e-06, + "loss": 0.6544, + "step": 1916 + }, + { + "epoch": 0.25, + "grad_norm": 0.9493619799613953, + "learning_rate": 8.830518599839286e-06, + "loss": 0.6211, + "step": 1917 + }, + { + "epoch": 0.25, + "grad_norm": 1.2883530855178833, + "learning_rate": 8.829184614214637e-06, + "loss": 0.6184, + "step": 1918 + }, + { + "epoch": 0.25, + "grad_norm": 1.0521215200424194, + "learning_rate": 8.82784996909077e-06, + "loss": 0.6081, + "step": 1919 + }, + { + "epoch": 0.25, + "grad_norm": 1.2523962259292603, + "learning_rate": 8.82651466469755e-06, + "loss": 0.6249, + "step": 1920 + }, + { + "epoch": 0.25, + "grad_norm": 1.0267293453216553, + "learning_rate": 8.825178701264957e-06, + "loss": 0.5879, + "step": 1921 + }, + { + "epoch": 0.25, + "grad_norm": 1.191742181777954, + "learning_rate": 8.82384207902308e-06, + "loss": 0.7142, + "step": 1922 + }, + { + "epoch": 0.25, + "grad_norm": 1.205351710319519, + "learning_rate": 8.822504798202128e-06, + "loss": 0.6608, + "step": 1923 + }, + { + "epoch": 0.25, + "grad_norm": 1.1526292562484741, + "learning_rate": 8.821166859032419e-06, + "loss": 0.6627, + "step": 1924 + }, + { + "epoch": 0.25, + "grad_norm": 1.3558284044265747, + "learning_rate": 8.819828261744388e-06, + "loss": 0.6308, + "step": 1925 + }, + { + "epoch": 0.25, + "grad_norm": 1.1486369371414185, + "learning_rate": 8.81848900656858e-06, + "loss": 0.6342, + "step": 1926 + }, + { + "epoch": 0.25, + "grad_norm": 2.2741756439208984, + "learning_rate": 8.817149093735654e-06, + "loss": 0.5495, + "step": 1927 + }, + { + "epoch": 0.25, + "grad_norm": 1.269870638847351, + "learning_rate": 8.815808523476383e-06, + "loss": 0.645, + "step": 1928 + }, + { + "epoch": 0.25, + "grad_norm": 1.1454700231552124, + "learning_rate": 8.814467296021652e-06, + "loss": 0.6326, + "step": 1929 + }, + { + "epoch": 0.25, + "grad_norm": 1.2435215711593628, + "learning_rate": 8.813125411602463e-06, + "loss": 0.6898, + "step": 1930 + }, + { + "epoch": 0.25, + "grad_norm": 1.1720244884490967, + "learning_rate": 8.811782870449925e-06, + "loss": 0.6177, + "step": 1931 + }, + { + "epoch": 0.25, + "grad_norm": 1.859727382659912, + "learning_rate": 8.810439672795266e-06, + "loss": 0.6215, + "step": 1932 + }, + { + "epoch": 0.25, + "grad_norm": 1.085492491722107, + "learning_rate": 8.809095818869823e-06, + "loss": 0.6814, + "step": 1933 + }, + { + "epoch": 0.25, + "grad_norm": 1.1132259368896484, + "learning_rate": 8.807751308905049e-06, + "loss": 0.6428, + "step": 1934 + }, + { + "epoch": 0.25, + "grad_norm": 1.0994184017181396, + "learning_rate": 8.806406143132507e-06, + "loss": 0.6146, + "step": 1935 + }, + { + "epoch": 0.25, + "grad_norm": 1.1863535642623901, + "learning_rate": 8.805060321783873e-06, + "loss": 0.7188, + "step": 1936 + }, + { + "epoch": 0.25, + "grad_norm": 1.321311116218567, + "learning_rate": 8.803713845090942e-06, + "loss": 0.5959, + "step": 1937 + }, + { + "epoch": 0.25, + "grad_norm": 1.1888467073440552, + "learning_rate": 8.802366713285612e-06, + "loss": 0.6004, + "step": 1938 + }, + { + "epoch": 0.25, + "grad_norm": 1.0325742959976196, + "learning_rate": 8.801018926599904e-06, + "loss": 0.5642, + "step": 1939 + }, + { + "epoch": 0.25, + "grad_norm": 1.123363971710205, + "learning_rate": 8.799670485265944e-06, + "loss": 0.708, + "step": 1940 + }, + { + "epoch": 0.25, + "grad_norm": 1.2410331964492798, + "learning_rate": 8.798321389515974e-06, + "loss": 0.6157, + "step": 1941 + }, + { + "epoch": 0.25, + "grad_norm": 1.289198398590088, + "learning_rate": 8.796971639582347e-06, + "loss": 0.6785, + "step": 1942 + }, + { + "epoch": 0.25, + "grad_norm": 1.5738413333892822, + "learning_rate": 8.795621235697531e-06, + "loss": 0.6329, + "step": 1943 + }, + { + "epoch": 0.25, + "grad_norm": 1.011083722114563, + "learning_rate": 8.79427017809411e-06, + "loss": 0.5909, + "step": 1944 + }, + { + "epoch": 0.25, + "grad_norm": 1.3928797245025635, + "learning_rate": 8.792918467004767e-06, + "loss": 0.5906, + "step": 1945 + }, + { + "epoch": 0.25, + "grad_norm": 0.9257722496986389, + "learning_rate": 8.791566102662315e-06, + "loss": 0.5623, + "step": 1946 + }, + { + "epoch": 0.25, + "grad_norm": 1.2923310995101929, + "learning_rate": 8.790213085299668e-06, + "loss": 0.6385, + "step": 1947 + }, + { + "epoch": 0.25, + "grad_norm": 1.4501917362213135, + "learning_rate": 8.788859415149856e-06, + "loss": 0.5815, + "step": 1948 + }, + { + "epoch": 0.25, + "grad_norm": 1.2182177305221558, + "learning_rate": 8.787505092446022e-06, + "loss": 0.6108, + "step": 1949 + }, + { + "epoch": 0.25, + "grad_norm": 1.0286319255828857, + "learning_rate": 8.786150117421418e-06, + "loss": 0.5499, + "step": 1950 + }, + { + "epoch": 0.25, + "grad_norm": 1.2858717441558838, + "learning_rate": 8.784794490309414e-06, + "loss": 0.5579, + "step": 1951 + }, + { + "epoch": 0.25, + "grad_norm": 0.9833030104637146, + "learning_rate": 8.783438211343487e-06, + "loss": 0.6406, + "step": 1952 + }, + { + "epoch": 0.25, + "grad_norm": 1.0329986810684204, + "learning_rate": 8.78208128075723e-06, + "loss": 0.6064, + "step": 1953 + }, + { + "epoch": 0.25, + "grad_norm": 1.3916441202163696, + "learning_rate": 8.780723698784346e-06, + "loss": 0.6235, + "step": 1954 + }, + { + "epoch": 0.25, + "grad_norm": 1.7905875444412231, + "learning_rate": 8.77936546565865e-06, + "loss": 0.6405, + "step": 1955 + }, + { + "epoch": 0.25, + "grad_norm": 1.13296377658844, + "learning_rate": 8.778006581614073e-06, + "loss": 0.6188, + "step": 1956 + }, + { + "epoch": 0.25, + "grad_norm": 1.1696197986602783, + "learning_rate": 8.776647046884651e-06, + "loss": 0.5943, + "step": 1957 + }, + { + "epoch": 0.25, + "grad_norm": 1.2003610134124756, + "learning_rate": 8.77528686170454e-06, + "loss": 0.6799, + "step": 1958 + }, + { + "epoch": 0.25, + "grad_norm": 1.1605165004730225, + "learning_rate": 8.773926026308002e-06, + "loss": 0.7245, + "step": 1959 + }, + { + "epoch": 0.25, + "grad_norm": 1.8035575151443481, + "learning_rate": 8.772564540929414e-06, + "loss": 0.6746, + "step": 1960 + }, + { + "epoch": 0.25, + "grad_norm": 1.1232562065124512, + "learning_rate": 8.771202405803263e-06, + "loss": 0.6389, + "step": 1961 + }, + { + "epoch": 0.25, + "grad_norm": 1.3930180072784424, + "learning_rate": 8.769839621164152e-06, + "loss": 0.5986, + "step": 1962 + }, + { + "epoch": 0.25, + "grad_norm": 1.1030189990997314, + "learning_rate": 8.768476187246789e-06, + "loss": 0.579, + "step": 1963 + }, + { + "epoch": 0.25, + "grad_norm": 1.1885334253311157, + "learning_rate": 8.767112104286003e-06, + "loss": 0.6221, + "step": 1964 + }, + { + "epoch": 0.25, + "grad_norm": 1.8045053482055664, + "learning_rate": 8.765747372516723e-06, + "loss": 0.59, + "step": 1965 + }, + { + "epoch": 0.25, + "grad_norm": 1.1481117010116577, + "learning_rate": 8.764381992174001e-06, + "loss": 0.6523, + "step": 1966 + }, + { + "epoch": 0.25, + "grad_norm": 1.3269027471542358, + "learning_rate": 8.763015963492996e-06, + "loss": 0.6512, + "step": 1967 + }, + { + "epoch": 0.25, + "grad_norm": 0.9991157054901123, + "learning_rate": 8.761649286708975e-06, + "loss": 0.5713, + "step": 1968 + }, + { + "epoch": 0.25, + "grad_norm": 1.0284345149993896, + "learning_rate": 8.760281962057324e-06, + "loss": 0.6516, + "step": 1969 + }, + { + "epoch": 0.25, + "grad_norm": 1.1772648096084595, + "learning_rate": 8.758913989773536e-06, + "loss": 0.7054, + "step": 1970 + }, + { + "epoch": 0.25, + "grad_norm": 1.2631373405456543, + "learning_rate": 8.757545370093216e-06, + "loss": 0.727, + "step": 1971 + }, + { + "epoch": 0.25, + "grad_norm": 1.1898554563522339, + "learning_rate": 8.756176103252082e-06, + "loss": 0.601, + "step": 1972 + }, + { + "epoch": 0.25, + "grad_norm": 1.192469835281372, + "learning_rate": 8.754806189485959e-06, + "loss": 0.5665, + "step": 1973 + }, + { + "epoch": 0.25, + "grad_norm": 0.9782890677452087, + "learning_rate": 8.75343562903079e-06, + "loss": 0.5972, + "step": 1974 + }, + { + "epoch": 0.25, + "grad_norm": 1.5428776741027832, + "learning_rate": 8.752064422122625e-06, + "loss": 0.6663, + "step": 1975 + }, + { + "epoch": 0.25, + "grad_norm": 1.1398931741714478, + "learning_rate": 8.750692568997629e-06, + "loss": 0.666, + "step": 1976 + }, + { + "epoch": 0.25, + "grad_norm": 1.2742382287979126, + "learning_rate": 8.74932006989207e-06, + "loss": 0.6918, + "step": 1977 + }, + { + "epoch": 0.25, + "grad_norm": 1.2991219758987427, + "learning_rate": 8.747946925042341e-06, + "loss": 0.6471, + "step": 1978 + }, + { + "epoch": 0.25, + "grad_norm": 1.6621350049972534, + "learning_rate": 8.746573134684932e-06, + "loss": 0.5951, + "step": 1979 + }, + { + "epoch": 0.25, + "grad_norm": 1.5337145328521729, + "learning_rate": 8.745198699056452e-06, + "loss": 0.5993, + "step": 1980 + }, + { + "epoch": 0.25, + "grad_norm": 1.0378293991088867, + "learning_rate": 8.74382361839362e-06, + "loss": 0.6211, + "step": 1981 + }, + { + "epoch": 0.25, + "grad_norm": 1.1881955862045288, + "learning_rate": 8.742447892933266e-06, + "loss": 0.6135, + "step": 1982 + }, + { + "epoch": 0.25, + "grad_norm": 1.375791311264038, + "learning_rate": 8.741071522912331e-06, + "loss": 0.7713, + "step": 1983 + }, + { + "epoch": 0.25, + "grad_norm": 1.263593077659607, + "learning_rate": 8.739694508567866e-06, + "loss": 0.6795, + "step": 1984 + }, + { + "epoch": 0.25, + "grad_norm": 1.0440095663070679, + "learning_rate": 8.738316850137034e-06, + "loss": 0.4923, + "step": 1985 + }, + { + "epoch": 0.25, + "grad_norm": 1.112327218055725, + "learning_rate": 8.736938547857109e-06, + "loss": 0.6235, + "step": 1986 + }, + { + "epoch": 0.25, + "grad_norm": 1.110875129699707, + "learning_rate": 8.735559601965475e-06, + "loss": 0.6164, + "step": 1987 + }, + { + "epoch": 0.25, + "grad_norm": 1.1973077058792114, + "learning_rate": 8.734180012699628e-06, + "loss": 0.5878, + "step": 1988 + }, + { + "epoch": 0.25, + "grad_norm": 1.1539205312728882, + "learning_rate": 8.732799780297174e-06, + "loss": 0.7529, + "step": 1989 + }, + { + "epoch": 0.25, + "grad_norm": 1.2167625427246094, + "learning_rate": 8.731418904995829e-06, + "loss": 0.6641, + "step": 1990 + }, + { + "epoch": 0.26, + "grad_norm": 1.0853583812713623, + "learning_rate": 8.730037387033422e-06, + "loss": 0.598, + "step": 1991 + }, + { + "epoch": 0.26, + "grad_norm": 1.1032205820083618, + "learning_rate": 8.72865522664789e-06, + "loss": 0.643, + "step": 1992 + }, + { + "epoch": 0.26, + "grad_norm": 1.2810516357421875, + "learning_rate": 8.727272424077284e-06, + "loss": 0.5398, + "step": 1993 + }, + { + "epoch": 0.26, + "grad_norm": 1.2989377975463867, + "learning_rate": 8.725888979559762e-06, + "loss": 0.5852, + "step": 1994 + }, + { + "epoch": 0.26, + "grad_norm": 1.120161533355713, + "learning_rate": 8.724504893333596e-06, + "loss": 0.7027, + "step": 1995 + }, + { + "epoch": 0.26, + "grad_norm": 1.3071115016937256, + "learning_rate": 8.723120165637165e-06, + "loss": 0.5965, + "step": 1996 + }, + { + "epoch": 0.26, + "grad_norm": 1.1043663024902344, + "learning_rate": 8.72173479670896e-06, + "loss": 0.6167, + "step": 1997 + }, + { + "epoch": 0.26, + "grad_norm": 1.1847256422042847, + "learning_rate": 8.720348786787583e-06, + "loss": 0.6122, + "step": 1998 + }, + { + "epoch": 0.26, + "grad_norm": 1.1380722522735596, + "learning_rate": 8.718962136111749e-06, + "loss": 0.6251, + "step": 1999 + }, + { + "epoch": 0.26, + "grad_norm": 1.1924020051956177, + "learning_rate": 8.717574844920274e-06, + "loss": 0.5994, + "step": 2000 + }, + { + "epoch": 0.26, + "grad_norm": 1.000066876411438, + "learning_rate": 8.716186913452097e-06, + "loss": 0.6475, + "step": 2001 + }, + { + "epoch": 0.26, + "grad_norm": 1.0465896129608154, + "learning_rate": 8.714798341946258e-06, + "loss": 0.6021, + "step": 2002 + }, + { + "epoch": 0.26, + "grad_norm": 1.0609678030014038, + "learning_rate": 8.71340913064191e-06, + "loss": 0.5697, + "step": 2003 + }, + { + "epoch": 0.26, + "grad_norm": 1.1682281494140625, + "learning_rate": 8.712019279778319e-06, + "loss": 0.6211, + "step": 2004 + }, + { + "epoch": 0.26, + "grad_norm": 1.4044312238693237, + "learning_rate": 8.710628789594855e-06, + "loss": 0.6077, + "step": 2005 + }, + { + "epoch": 0.26, + "grad_norm": 1.1312463283538818, + "learning_rate": 8.709237660331003e-06, + "loss": 0.5921, + "step": 2006 + }, + { + "epoch": 0.26, + "grad_norm": 1.1439876556396484, + "learning_rate": 8.70784589222636e-06, + "loss": 0.5924, + "step": 2007 + }, + { + "epoch": 0.26, + "grad_norm": 1.3385566473007202, + "learning_rate": 8.706453485520622e-06, + "loss": 0.5244, + "step": 2008 + }, + { + "epoch": 0.26, + "grad_norm": 1.2704468965530396, + "learning_rate": 8.70506044045361e-06, + "loss": 0.5937, + "step": 2009 + }, + { + "epoch": 0.26, + "grad_norm": 1.1238104104995728, + "learning_rate": 8.703666757265246e-06, + "loss": 0.6783, + "step": 2010 + }, + { + "epoch": 0.26, + "grad_norm": 1.1684532165527344, + "learning_rate": 8.702272436195562e-06, + "loss": 0.6352, + "step": 2011 + }, + { + "epoch": 0.26, + "grad_norm": 1.1618826389312744, + "learning_rate": 8.700877477484704e-06, + "loss": 0.6676, + "step": 2012 + }, + { + "epoch": 0.26, + "grad_norm": 1.2076810598373413, + "learning_rate": 8.699481881372922e-06, + "loss": 0.6103, + "step": 2013 + }, + { + "epoch": 0.26, + "grad_norm": 1.1898448467254639, + "learning_rate": 8.698085648100581e-06, + "loss": 0.5716, + "step": 2014 + }, + { + "epoch": 0.26, + "grad_norm": 1.7246116399765015, + "learning_rate": 8.696688777908154e-06, + "loss": 0.5495, + "step": 2015 + }, + { + "epoch": 0.26, + "grad_norm": 1.511298656463623, + "learning_rate": 8.695291271036221e-06, + "loss": 0.6376, + "step": 2016 + }, + { + "epoch": 0.26, + "grad_norm": 1.2623487710952759, + "learning_rate": 8.69389312772548e-06, + "loss": 0.6639, + "step": 2017 + }, + { + "epoch": 0.26, + "grad_norm": 1.220610499382019, + "learning_rate": 8.692494348216726e-06, + "loss": 0.6363, + "step": 2018 + }, + { + "epoch": 0.26, + "grad_norm": 1.2728426456451416, + "learning_rate": 8.691094932750875e-06, + "loss": 0.6267, + "step": 2019 + }, + { + "epoch": 0.26, + "grad_norm": 1.6735302209854126, + "learning_rate": 8.689694881568945e-06, + "loss": 0.6491, + "step": 2020 + }, + { + "epoch": 0.26, + "grad_norm": 1.3414764404296875, + "learning_rate": 8.688294194912066e-06, + "loss": 0.6314, + "step": 2021 + }, + { + "epoch": 0.26, + "grad_norm": 1.5352545976638794, + "learning_rate": 8.686892873021481e-06, + "loss": 0.6437, + "step": 2022 + }, + { + "epoch": 0.26, + "grad_norm": 1.3221575021743774, + "learning_rate": 8.685490916138536e-06, + "loss": 0.6519, + "step": 2023 + }, + { + "epoch": 0.26, + "grad_norm": 1.5757750272750854, + "learning_rate": 8.684088324504694e-06, + "loss": 0.6486, + "step": 2024 + }, + { + "epoch": 0.26, + "grad_norm": 1.2225582599639893, + "learning_rate": 8.682685098361518e-06, + "loss": 0.7509, + "step": 2025 + }, + { + "epoch": 0.26, + "grad_norm": 1.0095558166503906, + "learning_rate": 8.681281237950688e-06, + "loss": 0.6723, + "step": 2026 + }, + { + "epoch": 0.26, + "grad_norm": 0.9800388216972351, + "learning_rate": 8.67987674351399e-06, + "loss": 0.5663, + "step": 2027 + }, + { + "epoch": 0.26, + "grad_norm": 1.777536392211914, + "learning_rate": 8.678471615293317e-06, + "loss": 0.6395, + "step": 2028 + }, + { + "epoch": 0.26, + "grad_norm": 1.0923036336898804, + "learning_rate": 8.677065853530679e-06, + "loss": 0.5539, + "step": 2029 + }, + { + "epoch": 0.26, + "grad_norm": 1.2774169445037842, + "learning_rate": 8.675659458468186e-06, + "loss": 0.6695, + "step": 2030 + }, + { + "epoch": 0.26, + "grad_norm": 1.2363957166671753, + "learning_rate": 8.67425243034806e-06, + "loss": 0.6806, + "step": 2031 + }, + { + "epoch": 0.26, + "grad_norm": 1.3562581539154053, + "learning_rate": 8.672844769412637e-06, + "loss": 0.6637, + "step": 2032 + }, + { + "epoch": 0.26, + "grad_norm": 1.290300965309143, + "learning_rate": 8.671436475904353e-06, + "loss": 0.6027, + "step": 2033 + }, + { + "epoch": 0.26, + "grad_norm": 1.1578395366668701, + "learning_rate": 8.670027550065763e-06, + "loss": 0.6095, + "step": 2034 + }, + { + "epoch": 0.26, + "grad_norm": 3.5597174167633057, + "learning_rate": 8.668617992139524e-06, + "loss": 0.6364, + "step": 2035 + }, + { + "epoch": 0.26, + "grad_norm": 0.9307906031608582, + "learning_rate": 8.667207802368403e-06, + "loss": 0.671, + "step": 2036 + }, + { + "epoch": 0.26, + "grad_norm": 1.2509186267852783, + "learning_rate": 8.665796980995275e-06, + "loss": 0.6422, + "step": 2037 + }, + { + "epoch": 0.26, + "grad_norm": 2.086897373199463, + "learning_rate": 8.66438552826313e-06, + "loss": 0.6447, + "step": 2038 + }, + { + "epoch": 0.26, + "grad_norm": 1.172615885734558, + "learning_rate": 8.662973444415058e-06, + "loss": 0.562, + "step": 2039 + }, + { + "epoch": 0.26, + "grad_norm": 1.004035234451294, + "learning_rate": 8.661560729694262e-06, + "loss": 0.515, + "step": 2040 + }, + { + "epoch": 0.26, + "grad_norm": 1.1104687452316284, + "learning_rate": 8.660147384344055e-06, + "loss": 0.5915, + "step": 2041 + }, + { + "epoch": 0.26, + "grad_norm": 1.2272541522979736, + "learning_rate": 8.658733408607856e-06, + "loss": 0.5539, + "step": 2042 + }, + { + "epoch": 0.26, + "grad_norm": 1.0483099222183228, + "learning_rate": 8.657318802729194e-06, + "loss": 0.5457, + "step": 2043 + }, + { + "epoch": 0.26, + "grad_norm": 1.2460724115371704, + "learning_rate": 8.655903566951706e-06, + "loss": 0.6596, + "step": 2044 + }, + { + "epoch": 0.26, + "grad_norm": 0.8781019449234009, + "learning_rate": 8.654487701519139e-06, + "loss": 0.5678, + "step": 2045 + }, + { + "epoch": 0.26, + "grad_norm": 1.4468796253204346, + "learning_rate": 8.653071206675344e-06, + "loss": 0.6264, + "step": 2046 + }, + { + "epoch": 0.26, + "grad_norm": 5.015211582183838, + "learning_rate": 8.651654082664285e-06, + "loss": 0.5887, + "step": 2047 + }, + { + "epoch": 0.26, + "grad_norm": 1.4183422327041626, + "learning_rate": 8.650236329730034e-06, + "loss": 0.6769, + "step": 2048 + }, + { + "epoch": 0.26, + "grad_norm": 2.0147705078125, + "learning_rate": 8.648817948116767e-06, + "loss": 0.6663, + "step": 2049 + }, + { + "epoch": 0.26, + "grad_norm": 1.3201289176940918, + "learning_rate": 8.647398938068775e-06, + "loss": 0.6155, + "step": 2050 + }, + { + "epoch": 0.26, + "grad_norm": 1.1508480310440063, + "learning_rate": 8.645979299830452e-06, + "loss": 0.6329, + "step": 2051 + }, + { + "epoch": 0.26, + "grad_norm": 1.3195370435714722, + "learning_rate": 8.644559033646303e-06, + "loss": 0.6606, + "step": 2052 + }, + { + "epoch": 0.26, + "grad_norm": 1.2289732694625854, + "learning_rate": 8.643138139760935e-06, + "loss": 0.5887, + "step": 2053 + }, + { + "epoch": 0.26, + "grad_norm": 1.2333985567092896, + "learning_rate": 8.641716618419076e-06, + "loss": 0.7597, + "step": 2054 + }, + { + "epoch": 0.26, + "grad_norm": 1.6038957834243774, + "learning_rate": 8.640294469865548e-06, + "loss": 0.563, + "step": 2055 + }, + { + "epoch": 0.26, + "grad_norm": 1.6648248434066772, + "learning_rate": 8.638871694345293e-06, + "loss": 0.6064, + "step": 2056 + }, + { + "epoch": 0.26, + "grad_norm": 1.1484403610229492, + "learning_rate": 8.637448292103346e-06, + "loss": 0.6087, + "step": 2057 + }, + { + "epoch": 0.26, + "grad_norm": 1.367751955986023, + "learning_rate": 8.636024263384868e-06, + "loss": 0.6503, + "step": 2058 + }, + { + "epoch": 0.26, + "grad_norm": 1.4821258783340454, + "learning_rate": 8.634599608435115e-06, + "loss": 0.7098, + "step": 2059 + }, + { + "epoch": 0.26, + "grad_norm": 2.0166525840759277, + "learning_rate": 8.633174327499456e-06, + "loss": 0.6259, + "step": 2060 + }, + { + "epoch": 0.26, + "grad_norm": 1.2003066539764404, + "learning_rate": 8.631748420823365e-06, + "loss": 0.7077, + "step": 2061 + }, + { + "epoch": 0.26, + "grad_norm": 1.448062539100647, + "learning_rate": 8.630321888652426e-06, + "loss": 0.694, + "step": 2062 + }, + { + "epoch": 0.26, + "grad_norm": 2.6138148307800293, + "learning_rate": 8.628894731232332e-06, + "loss": 0.6076, + "step": 2063 + }, + { + "epoch": 0.26, + "grad_norm": 1.1501845121383667, + "learning_rate": 8.62746694880888e-06, + "loss": 0.5773, + "step": 2064 + }, + { + "epoch": 0.26, + "grad_norm": 1.4958134889602661, + "learning_rate": 8.626038541627977e-06, + "loss": 0.5991, + "step": 2065 + }, + { + "epoch": 0.26, + "grad_norm": 1.0958569049835205, + "learning_rate": 8.624609509935637e-06, + "loss": 0.5845, + "step": 2066 + }, + { + "epoch": 0.26, + "grad_norm": 1.1663223505020142, + "learning_rate": 8.623179853977984e-06, + "loss": 0.5782, + "step": 2067 + }, + { + "epoch": 0.26, + "grad_norm": 1.1979085206985474, + "learning_rate": 8.621749574001241e-06, + "loss": 0.6255, + "step": 2068 + }, + { + "epoch": 0.27, + "grad_norm": 1.3670035600662231, + "learning_rate": 8.620318670251752e-06, + "loss": 0.6218, + "step": 2069 + }, + { + "epoch": 0.27, + "grad_norm": 1.0638803243637085, + "learning_rate": 8.618887142975956e-06, + "loss": 0.696, + "step": 2070 + }, + { + "epoch": 0.27, + "grad_norm": 2.1462111473083496, + "learning_rate": 8.617454992420407e-06, + "loss": 0.6679, + "step": 2071 + }, + { + "epoch": 0.27, + "grad_norm": 1.3509697914123535, + "learning_rate": 8.616022218831764e-06, + "loss": 0.6722, + "step": 2072 + }, + { + "epoch": 0.27, + "grad_norm": 1.0871156454086304, + "learning_rate": 8.61458882245679e-06, + "loss": 0.5614, + "step": 2073 + }, + { + "epoch": 0.27, + "grad_norm": 1.0885635614395142, + "learning_rate": 8.613154803542362e-06, + "loss": 0.6469, + "step": 2074 + }, + { + "epoch": 0.27, + "grad_norm": 1.2749704122543335, + "learning_rate": 8.611720162335459e-06, + "loss": 0.6835, + "step": 2075 + }, + { + "epoch": 0.27, + "grad_norm": 1.2243422269821167, + "learning_rate": 8.61028489908317e-06, + "loss": 0.5278, + "step": 2076 + }, + { + "epoch": 0.27, + "grad_norm": 1.17266845703125, + "learning_rate": 8.608849014032687e-06, + "loss": 0.6967, + "step": 2077 + }, + { + "epoch": 0.27, + "grad_norm": 1.2027003765106201, + "learning_rate": 8.607412507431316e-06, + "loss": 0.7052, + "step": 2078 + }, + { + "epoch": 0.27, + "grad_norm": 1.0147993564605713, + "learning_rate": 8.605975379526463e-06, + "loss": 0.6634, + "step": 2079 + }, + { + "epoch": 0.27, + "grad_norm": 1.7742810249328613, + "learning_rate": 8.604537630565644e-06, + "loss": 0.6074, + "step": 2080 + }, + { + "epoch": 0.27, + "grad_norm": 1.3109650611877441, + "learning_rate": 8.603099260796486e-06, + "loss": 0.6322, + "step": 2081 + }, + { + "epoch": 0.27, + "grad_norm": 1.1907052993774414, + "learning_rate": 8.601660270466714e-06, + "loss": 0.6935, + "step": 2082 + }, + { + "epoch": 0.27, + "grad_norm": 1.3169562816619873, + "learning_rate": 8.600220659824166e-06, + "loss": 0.6313, + "step": 2083 + }, + { + "epoch": 0.27, + "grad_norm": 1.1314212083816528, + "learning_rate": 8.598780429116788e-06, + "loss": 0.5839, + "step": 2084 + }, + { + "epoch": 0.27, + "grad_norm": 1.2062609195709229, + "learning_rate": 8.59733957859263e-06, + "loss": 0.6198, + "step": 2085 + }, + { + "epoch": 0.27, + "grad_norm": 0.9913787245750427, + "learning_rate": 8.595898108499845e-06, + "loss": 0.6855, + "step": 2086 + }, + { + "epoch": 0.27, + "grad_norm": 1.212138056755066, + "learning_rate": 8.594456019086702e-06, + "loss": 0.5885, + "step": 2087 + }, + { + "epoch": 0.27, + "grad_norm": 1.0849419832229614, + "learning_rate": 8.59301331060157e-06, + "loss": 0.5638, + "step": 2088 + }, + { + "epoch": 0.27, + "grad_norm": 4.176388263702393, + "learning_rate": 8.591569983292924e-06, + "loss": 0.5899, + "step": 2089 + }, + { + "epoch": 0.27, + "grad_norm": 1.0811898708343506, + "learning_rate": 8.590126037409353e-06, + "loss": 0.6131, + "step": 2090 + }, + { + "epoch": 0.27, + "grad_norm": 1.1859910488128662, + "learning_rate": 8.588681473199543e-06, + "loss": 0.5915, + "step": 2091 + }, + { + "epoch": 0.27, + "grad_norm": 1.8125941753387451, + "learning_rate": 8.587236290912292e-06, + "loss": 0.66, + "step": 2092 + }, + { + "epoch": 0.27, + "grad_norm": 1.1952428817749023, + "learning_rate": 8.585790490796502e-06, + "loss": 0.5988, + "step": 2093 + }, + { + "epoch": 0.27, + "grad_norm": 1.172929048538208, + "learning_rate": 8.584344073101185e-06, + "loss": 0.6717, + "step": 2094 + }, + { + "epoch": 0.27, + "grad_norm": 1.0805124044418335, + "learning_rate": 8.582897038075455e-06, + "loss": 0.5679, + "step": 2095 + }, + { + "epoch": 0.27, + "grad_norm": 1.1243839263916016, + "learning_rate": 8.581449385968536e-06, + "loss": 0.6338, + "step": 2096 + }, + { + "epoch": 0.27, + "grad_norm": 1.2426873445510864, + "learning_rate": 8.580001117029755e-06, + "loss": 0.5016, + "step": 2097 + }, + { + "epoch": 0.27, + "grad_norm": 1.2226959466934204, + "learning_rate": 8.57855223150855e-06, + "loss": 0.628, + "step": 2098 + }, + { + "epoch": 0.27, + "grad_norm": 1.5704655647277832, + "learning_rate": 8.577102729654457e-06, + "loss": 0.6069, + "step": 2099 + }, + { + "epoch": 0.27, + "grad_norm": 1.1772339344024658, + "learning_rate": 8.575652611717127e-06, + "loss": 0.6016, + "step": 2100 + }, + { + "epoch": 0.27, + "grad_norm": 1.0823158025741577, + "learning_rate": 8.574201877946314e-06, + "loss": 0.6074, + "step": 2101 + }, + { + "epoch": 0.27, + "grad_norm": 1.330729365348816, + "learning_rate": 8.572750528591875e-06, + "loss": 0.6382, + "step": 2102 + }, + { + "epoch": 0.27, + "grad_norm": 1.2223377227783203, + "learning_rate": 8.571298563903775e-06, + "loss": 0.6634, + "step": 2103 + }, + { + "epoch": 0.27, + "grad_norm": 1.497209906578064, + "learning_rate": 8.56984598413209e-06, + "loss": 0.6778, + "step": 2104 + }, + { + "epoch": 0.27, + "grad_norm": 3.1404335498809814, + "learning_rate": 8.568392789526992e-06, + "loss": 0.6256, + "step": 2105 + }, + { + "epoch": 0.27, + "grad_norm": 1.1916518211364746, + "learning_rate": 8.566938980338765e-06, + "loss": 0.7344, + "step": 2106 + }, + { + "epoch": 0.27, + "grad_norm": 1.0679166316986084, + "learning_rate": 8.565484556817802e-06, + "loss": 0.6275, + "step": 2107 + }, + { + "epoch": 0.27, + "grad_norm": 1.1432031393051147, + "learning_rate": 8.564029519214594e-06, + "loss": 0.6351, + "step": 2108 + }, + { + "epoch": 0.27, + "grad_norm": 1.1834468841552734, + "learning_rate": 8.562573867779741e-06, + "loss": 0.6035, + "step": 2109 + }, + { + "epoch": 0.27, + "grad_norm": 1.60970938205719, + "learning_rate": 8.561117602763954e-06, + "loss": 0.6716, + "step": 2110 + }, + { + "epoch": 0.27, + "grad_norm": 1.1802395582199097, + "learning_rate": 8.559660724418041e-06, + "loss": 0.5264, + "step": 2111 + }, + { + "epoch": 0.27, + "grad_norm": 3.7724010944366455, + "learning_rate": 8.558203232992923e-06, + "loss": 0.6596, + "step": 2112 + }, + { + "epoch": 0.27, + "grad_norm": 1.0022841691970825, + "learning_rate": 8.556745128739618e-06, + "loss": 0.6672, + "step": 2113 + }, + { + "epoch": 0.27, + "grad_norm": 1.4700846672058105, + "learning_rate": 8.55528641190926e-06, + "loss": 0.5608, + "step": 2114 + }, + { + "epoch": 0.27, + "grad_norm": 1.1476560831069946, + "learning_rate": 8.553827082753084e-06, + "loss": 0.5078, + "step": 2115 + }, + { + "epoch": 0.27, + "grad_norm": 1.1997826099395752, + "learning_rate": 8.552367141522423e-06, + "loss": 0.5914, + "step": 2116 + }, + { + "epoch": 0.27, + "grad_norm": 1.022222876548767, + "learning_rate": 8.550906588468728e-06, + "loss": 0.6681, + "step": 2117 + }, + { + "epoch": 0.27, + "grad_norm": 1.3832002878189087, + "learning_rate": 8.549445423843548e-06, + "loss": 0.6029, + "step": 2118 + }, + { + "epoch": 0.27, + "grad_norm": 1.0408334732055664, + "learning_rate": 8.54798364789854e-06, + "loss": 0.5953, + "step": 2119 + }, + { + "epoch": 0.27, + "grad_norm": 1.0350518226623535, + "learning_rate": 8.546521260885463e-06, + "loss": 0.5858, + "step": 2120 + }, + { + "epoch": 0.27, + "grad_norm": 1.097774624824524, + "learning_rate": 8.545058263056186e-06, + "loss": 0.6631, + "step": 2121 + }, + { + "epoch": 0.27, + "grad_norm": 1.5874125957489014, + "learning_rate": 8.543594654662677e-06, + "loss": 0.6732, + "step": 2122 + }, + { + "epoch": 0.27, + "grad_norm": 1.0040942430496216, + "learning_rate": 8.542130435957014e-06, + "loss": 0.6054, + "step": 2123 + }, + { + "epoch": 0.27, + "grad_norm": 1.172186255455017, + "learning_rate": 8.540665607191383e-06, + "loss": 0.6075, + "step": 2124 + }, + { + "epoch": 0.27, + "grad_norm": 1.285057783126831, + "learning_rate": 8.539200168618067e-06, + "loss": 0.5955, + "step": 2125 + }, + { + "epoch": 0.27, + "grad_norm": 1.245982050895691, + "learning_rate": 8.537734120489459e-06, + "loss": 0.5765, + "step": 2126 + }, + { + "epoch": 0.27, + "grad_norm": 1.9209935665130615, + "learning_rate": 8.536267463058055e-06, + "loss": 0.65, + "step": 2127 + }, + { + "epoch": 0.27, + "grad_norm": 1.3906763792037964, + "learning_rate": 8.534800196576459e-06, + "loss": 0.6005, + "step": 2128 + }, + { + "epoch": 0.27, + "grad_norm": 1.8860881328582764, + "learning_rate": 8.533332321297374e-06, + "loss": 0.6604, + "step": 2129 + }, + { + "epoch": 0.27, + "grad_norm": 1.790269374847412, + "learning_rate": 8.531863837473617e-06, + "loss": 0.6665, + "step": 2130 + }, + { + "epoch": 0.27, + "grad_norm": 1.052520990371704, + "learning_rate": 8.530394745358101e-06, + "loss": 0.5778, + "step": 2131 + }, + { + "epoch": 0.27, + "grad_norm": 1.5288801193237305, + "learning_rate": 8.528925045203847e-06, + "loss": 0.6731, + "step": 2132 + }, + { + "epoch": 0.27, + "grad_norm": 0.9974155426025391, + "learning_rate": 8.527454737263983e-06, + "loss": 0.5731, + "step": 2133 + }, + { + "epoch": 0.27, + "grad_norm": 1.3304848670959473, + "learning_rate": 8.52598382179174e-06, + "loss": 0.5752, + "step": 2134 + }, + { + "epoch": 0.27, + "grad_norm": 0.973964512348175, + "learning_rate": 8.524512299040451e-06, + "loss": 0.6316, + "step": 2135 + }, + { + "epoch": 0.27, + "grad_norm": 1.7174031734466553, + "learning_rate": 8.523040169263555e-06, + "loss": 0.6612, + "step": 2136 + }, + { + "epoch": 0.27, + "grad_norm": 1.5883331298828125, + "learning_rate": 8.5215674327146e-06, + "loss": 0.6166, + "step": 2137 + }, + { + "epoch": 0.27, + "grad_norm": 0.9867196083068848, + "learning_rate": 8.520094089647233e-06, + "loss": 0.557, + "step": 2138 + }, + { + "epoch": 0.27, + "grad_norm": 1.4658268690109253, + "learning_rate": 8.518620140315209e-06, + "loss": 0.5743, + "step": 2139 + }, + { + "epoch": 0.27, + "grad_norm": 0.9947718381881714, + "learning_rate": 8.517145584972383e-06, + "loss": 0.558, + "step": 2140 + }, + { + "epoch": 0.27, + "grad_norm": 1.1221495866775513, + "learning_rate": 8.515670423872719e-06, + "loss": 0.6239, + "step": 2141 + }, + { + "epoch": 0.27, + "grad_norm": 1.916927695274353, + "learning_rate": 8.514194657270283e-06, + "loss": 0.6223, + "step": 2142 + }, + { + "epoch": 0.27, + "grad_norm": 1.1117417812347412, + "learning_rate": 8.512718285419246e-06, + "loss": 0.5865, + "step": 2143 + }, + { + "epoch": 0.27, + "grad_norm": 1.2808547019958496, + "learning_rate": 8.511241308573884e-06, + "loss": 0.6683, + "step": 2144 + }, + { + "epoch": 0.27, + "grad_norm": 1.0311280488967896, + "learning_rate": 8.509763726988573e-06, + "loss": 0.5753, + "step": 2145 + }, + { + "epoch": 0.27, + "grad_norm": 1.4085116386413574, + "learning_rate": 8.5082855409178e-06, + "loss": 0.5724, + "step": 2146 + }, + { + "epoch": 0.28, + "grad_norm": 2.5789756774902344, + "learning_rate": 8.506806750616152e-06, + "loss": 0.5779, + "step": 2147 + }, + { + "epoch": 0.28, + "grad_norm": 1.2261545658111572, + "learning_rate": 8.505327356338318e-06, + "loss": 0.7003, + "step": 2148 + }, + { + "epoch": 0.28, + "grad_norm": 1.2848753929138184, + "learning_rate": 8.503847358339094e-06, + "loss": 0.534, + "step": 2149 + }, + { + "epoch": 0.28, + "grad_norm": 1.2150418758392334, + "learning_rate": 8.502366756873384e-06, + "loss": 0.5801, + "step": 2150 + }, + { + "epoch": 0.28, + "grad_norm": 1.183171033859253, + "learning_rate": 8.500885552196187e-06, + "loss": 0.6252, + "step": 2151 + }, + { + "epoch": 0.28, + "grad_norm": 1.2070298194885254, + "learning_rate": 8.499403744562613e-06, + "loss": 0.611, + "step": 2152 + }, + { + "epoch": 0.28, + "grad_norm": 1.6139317750930786, + "learning_rate": 8.497921334227872e-06, + "loss": 0.6351, + "step": 2153 + }, + { + "epoch": 0.28, + "grad_norm": 1.6720945835113525, + "learning_rate": 8.496438321447278e-06, + "loss": 0.6369, + "step": 2154 + }, + { + "epoch": 0.28, + "grad_norm": 1.2238283157348633, + "learning_rate": 8.49495470647625e-06, + "loss": 0.68, + "step": 2155 + }, + { + "epoch": 0.28, + "grad_norm": 1.3400514125823975, + "learning_rate": 8.493470489570314e-06, + "loss": 0.6404, + "step": 2156 + }, + { + "epoch": 0.28, + "grad_norm": 1.198529839515686, + "learning_rate": 8.491985670985093e-06, + "loss": 0.6745, + "step": 2157 + }, + { + "epoch": 0.28, + "grad_norm": 3.067074775695801, + "learning_rate": 8.490500250976314e-06, + "loss": 0.6279, + "step": 2158 + }, + { + "epoch": 0.28, + "grad_norm": 1.331629991531372, + "learning_rate": 8.489014229799816e-06, + "loss": 0.6106, + "step": 2159 + }, + { + "epoch": 0.28, + "grad_norm": 1.2290736436843872, + "learning_rate": 8.487527607711535e-06, + "loss": 0.5825, + "step": 2160 + }, + { + "epoch": 0.28, + "grad_norm": 1.1484627723693848, + "learning_rate": 8.486040384967509e-06, + "loss": 0.5505, + "step": 2161 + }, + { + "epoch": 0.28, + "grad_norm": 1.2274521589279175, + "learning_rate": 8.484552561823885e-06, + "loss": 0.6092, + "step": 2162 + }, + { + "epoch": 0.28, + "grad_norm": 1.2703810930252075, + "learning_rate": 8.483064138536906e-06, + "loss": 0.5541, + "step": 2163 + }, + { + "epoch": 0.28, + "grad_norm": 1.1794027090072632, + "learning_rate": 8.481575115362926e-06, + "loss": 0.6615, + "step": 2164 + }, + { + "epoch": 0.28, + "grad_norm": 1.208961844444275, + "learning_rate": 8.480085492558398e-06, + "loss": 0.5989, + "step": 2165 + }, + { + "epoch": 0.28, + "grad_norm": 1.370457649230957, + "learning_rate": 8.47859527037988e-06, + "loss": 0.5694, + "step": 2166 + }, + { + "epoch": 0.28, + "grad_norm": 1.082960605621338, + "learning_rate": 8.47710444908403e-06, + "loss": 0.5625, + "step": 2167 + }, + { + "epoch": 0.28, + "grad_norm": 1.1672261953353882, + "learning_rate": 8.475613028927615e-06, + "loss": 0.7696, + "step": 2168 + }, + { + "epoch": 0.28, + "grad_norm": 1.1371452808380127, + "learning_rate": 8.4741210101675e-06, + "loss": 0.5821, + "step": 2169 + }, + { + "epoch": 0.28, + "grad_norm": 1.1140472888946533, + "learning_rate": 8.472628393060654e-06, + "loss": 0.5798, + "step": 2170 + }, + { + "epoch": 0.28, + "grad_norm": 1.0004754066467285, + "learning_rate": 8.471135177864152e-06, + "loss": 0.5552, + "step": 2171 + }, + { + "epoch": 0.28, + "grad_norm": 1.2164877653121948, + "learning_rate": 8.469641364835171e-06, + "loss": 0.5519, + "step": 2172 + }, + { + "epoch": 0.28, + "grad_norm": 2.1060073375701904, + "learning_rate": 8.468146954230984e-06, + "loss": 0.7057, + "step": 2173 + }, + { + "epoch": 0.28, + "grad_norm": 1.1200610399246216, + "learning_rate": 8.466651946308979e-06, + "loss": 0.6117, + "step": 2174 + }, + { + "epoch": 0.28, + "grad_norm": 1.214603304862976, + "learning_rate": 8.465156341326639e-06, + "loss": 0.6462, + "step": 2175 + }, + { + "epoch": 0.28, + "grad_norm": 1.2249237298965454, + "learning_rate": 8.46366013954155e-06, + "loss": 0.7087, + "step": 2176 + }, + { + "epoch": 0.28, + "grad_norm": 0.9753620028495789, + "learning_rate": 8.462163341211404e-06, + "loss": 0.6127, + "step": 2177 + }, + { + "epoch": 0.28, + "grad_norm": 1.143880844116211, + "learning_rate": 8.460665946593994e-06, + "loss": 0.6184, + "step": 2178 + }, + { + "epoch": 0.28, + "grad_norm": 1.178911566734314, + "learning_rate": 8.459167955947217e-06, + "loss": 0.6362, + "step": 2179 + }, + { + "epoch": 0.28, + "grad_norm": 1.4393823146820068, + "learning_rate": 8.457669369529067e-06, + "loss": 0.7749, + "step": 2180 + }, + { + "epoch": 0.28, + "grad_norm": 2.1207737922668457, + "learning_rate": 8.456170187597647e-06, + "loss": 0.6919, + "step": 2181 + }, + { + "epoch": 0.28, + "grad_norm": 1.2718356847763062, + "learning_rate": 8.454670410411165e-06, + "loss": 0.5967, + "step": 2182 + }, + { + "epoch": 0.28, + "grad_norm": 1.3145051002502441, + "learning_rate": 8.453170038227922e-06, + "loss": 0.7233, + "step": 2183 + }, + { + "epoch": 0.28, + "grad_norm": 1.1284637451171875, + "learning_rate": 8.451669071306326e-06, + "loss": 0.629, + "step": 2184 + }, + { + "epoch": 0.28, + "grad_norm": 1.1894221305847168, + "learning_rate": 8.450167509904892e-06, + "loss": 0.5804, + "step": 2185 + }, + { + "epoch": 0.28, + "grad_norm": 1.2713896036148071, + "learning_rate": 8.448665354282233e-06, + "loss": 0.6278, + "step": 2186 + }, + { + "epoch": 0.28, + "grad_norm": 1.4796535968780518, + "learning_rate": 8.447162604697062e-06, + "loss": 0.6252, + "step": 2187 + }, + { + "epoch": 0.28, + "grad_norm": 2.35587739944458, + "learning_rate": 8.445659261408199e-06, + "loss": 0.7211, + "step": 2188 + }, + { + "epoch": 0.28, + "grad_norm": 1.649792194366455, + "learning_rate": 8.444155324674564e-06, + "loss": 0.6718, + "step": 2189 + }, + { + "epoch": 0.28, + "grad_norm": 1.4509642124176025, + "learning_rate": 8.442650794755178e-06, + "loss": 0.6138, + "step": 2190 + }, + { + "epoch": 0.28, + "grad_norm": 1.5314606428146362, + "learning_rate": 8.44114567190917e-06, + "loss": 0.6393, + "step": 2191 + }, + { + "epoch": 0.28, + "grad_norm": 1.026334524154663, + "learning_rate": 8.439639956395763e-06, + "loss": 0.5199, + "step": 2192 + }, + { + "epoch": 0.28, + "grad_norm": 1.0074406862258911, + "learning_rate": 8.438133648474284e-06, + "loss": 0.6609, + "step": 2193 + }, + { + "epoch": 0.28, + "grad_norm": 1.685986876487732, + "learning_rate": 8.43662674840417e-06, + "loss": 0.64, + "step": 2194 + }, + { + "epoch": 0.28, + "grad_norm": 1.3212658166885376, + "learning_rate": 8.435119256444948e-06, + "loss": 0.6855, + "step": 2195 + }, + { + "epoch": 0.28, + "grad_norm": 1.209119439125061, + "learning_rate": 8.433611172856258e-06, + "loss": 0.7623, + "step": 2196 + }, + { + "epoch": 0.28, + "grad_norm": 1.0640321969985962, + "learning_rate": 8.432102497897832e-06, + "loss": 0.5945, + "step": 2197 + }, + { + "epoch": 0.28, + "grad_norm": 1.2218296527862549, + "learning_rate": 8.430593231829512e-06, + "loss": 0.5931, + "step": 2198 + }, + { + "epoch": 0.28, + "grad_norm": 1.5449182987213135, + "learning_rate": 8.429083374911238e-06, + "loss": 0.5279, + "step": 2199 + }, + { + "epoch": 0.28, + "grad_norm": 1.1568078994750977, + "learning_rate": 8.427572927403049e-06, + "loss": 0.5968, + "step": 2200 + }, + { + "epoch": 0.28, + "grad_norm": 1.2066130638122559, + "learning_rate": 8.426061889565094e-06, + "loss": 0.569, + "step": 2201 + }, + { + "epoch": 0.28, + "grad_norm": 1.4190328121185303, + "learning_rate": 8.424550261657614e-06, + "loss": 0.6139, + "step": 2202 + }, + { + "epoch": 0.28, + "grad_norm": 1.1991509199142456, + "learning_rate": 8.423038043940958e-06, + "loss": 0.6717, + "step": 2203 + }, + { + "epoch": 0.28, + "grad_norm": 1.6494675874710083, + "learning_rate": 8.421525236675577e-06, + "loss": 0.5841, + "step": 2204 + }, + { + "epoch": 0.28, + "grad_norm": 1.3361454010009766, + "learning_rate": 8.420011840122016e-06, + "loss": 0.6102, + "step": 2205 + }, + { + "epoch": 0.28, + "grad_norm": 1.5138863325119019, + "learning_rate": 8.418497854540933e-06, + "loss": 0.6498, + "step": 2206 + }, + { + "epoch": 0.28, + "grad_norm": 1.3878949880599976, + "learning_rate": 8.416983280193076e-06, + "loss": 0.6866, + "step": 2207 + }, + { + "epoch": 0.28, + "grad_norm": 1.4841134548187256, + "learning_rate": 8.415468117339302e-06, + "loss": 0.6434, + "step": 2208 + }, + { + "epoch": 0.28, + "grad_norm": 1.2668837308883667, + "learning_rate": 8.413952366240565e-06, + "loss": 0.6114, + "step": 2209 + }, + { + "epoch": 0.28, + "grad_norm": 1.4674394130706787, + "learning_rate": 8.412436027157927e-06, + "loss": 0.5745, + "step": 2210 + }, + { + "epoch": 0.28, + "grad_norm": 1.2802519798278809, + "learning_rate": 8.410919100352543e-06, + "loss": 0.564, + "step": 2211 + }, + { + "epoch": 0.28, + "grad_norm": 1.2879337072372437, + "learning_rate": 8.409401586085673e-06, + "loss": 0.6732, + "step": 2212 + }, + { + "epoch": 0.28, + "grad_norm": 1.2128925323486328, + "learning_rate": 8.407883484618679e-06, + "loss": 0.6827, + "step": 2213 + }, + { + "epoch": 0.28, + "grad_norm": 1.12970769405365, + "learning_rate": 8.406364796213023e-06, + "loss": 0.6273, + "step": 2214 + }, + { + "epoch": 0.28, + "grad_norm": 1.1036797761917114, + "learning_rate": 8.404845521130268e-06, + "loss": 0.5723, + "step": 2215 + }, + { + "epoch": 0.28, + "grad_norm": 1.1636890172958374, + "learning_rate": 8.403325659632076e-06, + "loss": 0.6296, + "step": 2216 + }, + { + "epoch": 0.28, + "grad_norm": 1.6161547899246216, + "learning_rate": 8.401805211980215e-06, + "loss": 0.6374, + "step": 2217 + }, + { + "epoch": 0.28, + "grad_norm": 1.2114921808242798, + "learning_rate": 8.400284178436551e-06, + "loss": 0.6444, + "step": 2218 + }, + { + "epoch": 0.28, + "grad_norm": 1.136370062828064, + "learning_rate": 8.39876255926305e-06, + "loss": 0.6796, + "step": 2219 + }, + { + "epoch": 0.28, + "grad_norm": 10.106849670410156, + "learning_rate": 8.397240354721782e-06, + "loss": 0.6263, + "step": 2220 + }, + { + "epoch": 0.28, + "grad_norm": 1.2132163047790527, + "learning_rate": 8.395717565074913e-06, + "loss": 0.6485, + "step": 2221 + }, + { + "epoch": 0.28, + "grad_norm": 1.0989460945129395, + "learning_rate": 8.394194190584714e-06, + "loss": 0.6106, + "step": 2222 + }, + { + "epoch": 0.28, + "grad_norm": 1.1754080057144165, + "learning_rate": 8.392670231513557e-06, + "loss": 0.585, + "step": 2223 + }, + { + "epoch": 0.28, + "grad_norm": 1.3587006330490112, + "learning_rate": 8.39114568812391e-06, + "loss": 0.6329, + "step": 2224 + }, + { + "epoch": 0.29, + "grad_norm": 1.0616475343704224, + "learning_rate": 8.389620560678345e-06, + "loss": 0.6499, + "step": 2225 + }, + { + "epoch": 0.29, + "grad_norm": 1.4234185218811035, + "learning_rate": 8.388094849439536e-06, + "loss": 0.5861, + "step": 2226 + }, + { + "epoch": 0.29, + "grad_norm": 1.3070238828659058, + "learning_rate": 8.386568554670255e-06, + "loss": 0.7642, + "step": 2227 + }, + { + "epoch": 0.29, + "grad_norm": 1.2934914827346802, + "learning_rate": 8.385041676633375e-06, + "loss": 0.6081, + "step": 2228 + }, + { + "epoch": 0.29, + "grad_norm": 2.055109739303589, + "learning_rate": 8.383514215591868e-06, + "loss": 0.5796, + "step": 2229 + }, + { + "epoch": 0.29, + "grad_norm": 1.8594862222671509, + "learning_rate": 8.381986171808811e-06, + "loss": 0.5904, + "step": 2230 + }, + { + "epoch": 0.29, + "grad_norm": 1.1786482334136963, + "learning_rate": 8.380457545547378e-06, + "loss": 0.6771, + "step": 2231 + }, + { + "epoch": 0.29, + "grad_norm": 1.8738934993743896, + "learning_rate": 8.378928337070844e-06, + "loss": 0.5631, + "step": 2232 + }, + { + "epoch": 0.29, + "grad_norm": 1.595661997795105, + "learning_rate": 8.37739854664258e-06, + "loss": 0.5903, + "step": 2233 + }, + { + "epoch": 0.29, + "grad_norm": 1.2699490785598755, + "learning_rate": 8.375868174526066e-06, + "loss": 0.7472, + "step": 2234 + }, + { + "epoch": 0.29, + "grad_norm": 1.3659288883209229, + "learning_rate": 8.374337220984879e-06, + "loss": 0.5874, + "step": 2235 + }, + { + "epoch": 0.29, + "grad_norm": 1.3856401443481445, + "learning_rate": 8.372805686282688e-06, + "loss": 0.5405, + "step": 2236 + }, + { + "epoch": 0.29, + "grad_norm": 1.9804805517196655, + "learning_rate": 8.371273570683273e-06, + "loss": 0.6157, + "step": 2237 + }, + { + "epoch": 0.29, + "grad_norm": 1.1596951484680176, + "learning_rate": 8.369740874450511e-06, + "loss": 0.6362, + "step": 2238 + }, + { + "epoch": 0.29, + "grad_norm": 1.279098391532898, + "learning_rate": 8.368207597848375e-06, + "loss": 0.6012, + "step": 2239 + }, + { + "epoch": 0.29, + "grad_norm": 1.4010846614837646, + "learning_rate": 8.36667374114094e-06, + "loss": 0.5266, + "step": 2240 + }, + { + "epoch": 0.29, + "grad_norm": 0.9738383293151855, + "learning_rate": 8.365139304592384e-06, + "loss": 0.6336, + "step": 2241 + }, + { + "epoch": 0.29, + "grad_norm": 1.3508878946304321, + "learning_rate": 8.363604288466984e-06, + "loss": 0.6368, + "step": 2242 + }, + { + "epoch": 0.29, + "grad_norm": 1.1931465864181519, + "learning_rate": 8.362068693029111e-06, + "loss": 0.5803, + "step": 2243 + }, + { + "epoch": 0.29, + "grad_norm": 1.0445038080215454, + "learning_rate": 8.360532518543241e-06, + "loss": 0.5703, + "step": 2244 + }, + { + "epoch": 0.29, + "grad_norm": 1.1057722568511963, + "learning_rate": 8.358995765273953e-06, + "loss": 0.6061, + "step": 2245 + }, + { + "epoch": 0.29, + "grad_norm": 2.1264703273773193, + "learning_rate": 8.357458433485917e-06, + "loss": 0.6588, + "step": 2246 + }, + { + "epoch": 0.29, + "grad_norm": 1.4543817043304443, + "learning_rate": 8.355920523443909e-06, + "loss": 0.5947, + "step": 2247 + }, + { + "epoch": 0.29, + "grad_norm": 0.9155209064483643, + "learning_rate": 8.354382035412803e-06, + "loss": 0.5496, + "step": 2248 + }, + { + "epoch": 0.29, + "grad_norm": 1.251805067062378, + "learning_rate": 8.35284296965757e-06, + "loss": 0.5388, + "step": 2249 + }, + { + "epoch": 0.29, + "grad_norm": 1.2307274341583252, + "learning_rate": 8.351303326443287e-06, + "loss": 0.6713, + "step": 2250 + }, + { + "epoch": 0.29, + "grad_norm": 1.3618934154510498, + "learning_rate": 8.349763106035123e-06, + "loss": 0.6442, + "step": 2251 + }, + { + "epoch": 0.29, + "grad_norm": 1.1276440620422363, + "learning_rate": 8.348222308698348e-06, + "loss": 0.6474, + "step": 2252 + }, + { + "epoch": 0.29, + "grad_norm": 1.5264719724655151, + "learning_rate": 8.34668093469834e-06, + "loss": 0.623, + "step": 2253 + }, + { + "epoch": 0.29, + "grad_norm": 1.4504443407058716, + "learning_rate": 8.34513898430056e-06, + "loss": 0.6397, + "step": 2254 + }, + { + "epoch": 0.29, + "grad_norm": 1.4074475765228271, + "learning_rate": 8.343596457770586e-06, + "loss": 0.6265, + "step": 2255 + }, + { + "epoch": 0.29, + "grad_norm": 1.2215962409973145, + "learning_rate": 8.342053355374082e-06, + "loss": 0.708, + "step": 2256 + }, + { + "epoch": 0.29, + "grad_norm": 1.6315170526504517, + "learning_rate": 8.340509677376817e-06, + "loss": 0.6566, + "step": 2257 + }, + { + "epoch": 0.29, + "grad_norm": 1.2518130540847778, + "learning_rate": 8.338965424044658e-06, + "loss": 0.7536, + "step": 2258 + }, + { + "epoch": 0.29, + "grad_norm": 1.4042388200759888, + "learning_rate": 8.337420595643574e-06, + "loss": 0.6747, + "step": 2259 + }, + { + "epoch": 0.29, + "grad_norm": 1.046313762664795, + "learning_rate": 8.335875192439627e-06, + "loss": 0.6222, + "step": 2260 + }, + { + "epoch": 0.29, + "grad_norm": 1.1476646661758423, + "learning_rate": 8.33432921469898e-06, + "loss": 0.6875, + "step": 2261 + }, + { + "epoch": 0.29, + "grad_norm": 1.1270971298217773, + "learning_rate": 8.332782662687902e-06, + "loss": 0.5926, + "step": 2262 + }, + { + "epoch": 0.29, + "grad_norm": 1.2763153314590454, + "learning_rate": 8.331235536672748e-06, + "loss": 0.5782, + "step": 2263 + }, + { + "epoch": 0.29, + "grad_norm": 1.3246185779571533, + "learning_rate": 8.329687836919986e-06, + "loss": 0.6623, + "step": 2264 + }, + { + "epoch": 0.29, + "grad_norm": 1.4971126317977905, + "learning_rate": 8.328139563696172e-06, + "loss": 0.6468, + "step": 2265 + }, + { + "epoch": 0.29, + "grad_norm": 1.484387993812561, + "learning_rate": 8.326590717267964e-06, + "loss": 0.6747, + "step": 2266 + }, + { + "epoch": 0.29, + "grad_norm": 1.130797266960144, + "learning_rate": 8.32504129790212e-06, + "loss": 0.6821, + "step": 2267 + }, + { + "epoch": 0.29, + "grad_norm": 1.0058528184890747, + "learning_rate": 8.323491305865498e-06, + "loss": 0.5863, + "step": 2268 + }, + { + "epoch": 0.29, + "grad_norm": 1.406995415687561, + "learning_rate": 8.321940741425049e-06, + "loss": 0.6819, + "step": 2269 + }, + { + "epoch": 0.29, + "grad_norm": 1.8498761653900146, + "learning_rate": 8.32038960484783e-06, + "loss": 0.7176, + "step": 2270 + }, + { + "epoch": 0.29, + "grad_norm": 1.4135112762451172, + "learning_rate": 8.31883789640099e-06, + "loss": 0.5951, + "step": 2271 + }, + { + "epoch": 0.29, + "grad_norm": 1.5905379056930542, + "learning_rate": 8.317285616351782e-06, + "loss": 0.7059, + "step": 2272 + }, + { + "epoch": 0.29, + "grad_norm": 1.0527902841567993, + "learning_rate": 8.315732764967552e-06, + "loss": 0.6129, + "step": 2273 + }, + { + "epoch": 0.29, + "grad_norm": 1.336744785308838, + "learning_rate": 8.314179342515746e-06, + "loss": 0.6918, + "step": 2274 + }, + { + "epoch": 0.29, + "grad_norm": 1.7440773248672485, + "learning_rate": 8.312625349263914e-06, + "loss": 0.6386, + "step": 2275 + }, + { + "epoch": 0.29, + "grad_norm": 1.086266279220581, + "learning_rate": 8.311070785479699e-06, + "loss": 0.6303, + "step": 2276 + }, + { + "epoch": 0.29, + "grad_norm": 1.1209901571273804, + "learning_rate": 8.309515651430837e-06, + "loss": 0.6507, + "step": 2277 + }, + { + "epoch": 0.29, + "grad_norm": 1.4283497333526611, + "learning_rate": 8.307959947385174e-06, + "loss": 0.6424, + "step": 2278 + }, + { + "epoch": 0.29, + "grad_norm": 1.2685678005218506, + "learning_rate": 8.306403673610646e-06, + "loss": 0.5953, + "step": 2279 + }, + { + "epoch": 0.29, + "grad_norm": 1.394755482673645, + "learning_rate": 8.304846830375294e-06, + "loss": 0.5581, + "step": 2280 + }, + { + "epoch": 0.29, + "grad_norm": 1.204455852508545, + "learning_rate": 8.303289417947244e-06, + "loss": 0.5786, + "step": 2281 + }, + { + "epoch": 0.29, + "grad_norm": 1.3802586793899536, + "learning_rate": 8.301731436594737e-06, + "loss": 0.5582, + "step": 2282 + }, + { + "epoch": 0.29, + "grad_norm": 1.6759288311004639, + "learning_rate": 8.3001728865861e-06, + "loss": 0.6505, + "step": 2283 + }, + { + "epoch": 0.29, + "grad_norm": 1.07804536819458, + "learning_rate": 8.298613768189761e-06, + "loss": 0.6687, + "step": 2284 + }, + { + "epoch": 0.29, + "grad_norm": 1.4512630701065063, + "learning_rate": 8.297054081674247e-06, + "loss": 0.6112, + "step": 2285 + }, + { + "epoch": 0.29, + "grad_norm": 2.449326753616333, + "learning_rate": 8.29549382730818e-06, + "loss": 0.628, + "step": 2286 + }, + { + "epoch": 0.29, + "grad_norm": 1.163280725479126, + "learning_rate": 8.293933005360288e-06, + "loss": 0.6082, + "step": 2287 + }, + { + "epoch": 0.29, + "grad_norm": 1.181201696395874, + "learning_rate": 8.292371616099388e-06, + "loss": 0.6441, + "step": 2288 + }, + { + "epoch": 0.29, + "grad_norm": 1.1229546070098877, + "learning_rate": 8.290809659794397e-06, + "loss": 0.6509, + "step": 2289 + }, + { + "epoch": 0.29, + "grad_norm": 1.2174752950668335, + "learning_rate": 8.289247136714328e-06, + "loss": 0.6646, + "step": 2290 + }, + { + "epoch": 0.29, + "grad_norm": 1.106174111366272, + "learning_rate": 8.287684047128298e-06, + "loss": 0.6211, + "step": 2291 + }, + { + "epoch": 0.29, + "grad_norm": 1.5392602682113647, + "learning_rate": 8.286120391305515e-06, + "loss": 0.5977, + "step": 2292 + }, + { + "epoch": 0.29, + "grad_norm": 1.2090442180633545, + "learning_rate": 8.284556169515287e-06, + "loss": 0.6219, + "step": 2293 + }, + { + "epoch": 0.29, + "grad_norm": 1.6874985694885254, + "learning_rate": 8.282991382027022e-06, + "loss": 0.6698, + "step": 2294 + }, + { + "epoch": 0.29, + "grad_norm": 1.1028480529785156, + "learning_rate": 8.281426029110218e-06, + "loss": 0.6597, + "step": 2295 + }, + { + "epoch": 0.29, + "grad_norm": 1.1251074075698853, + "learning_rate": 8.279860111034478e-06, + "loss": 0.6606, + "step": 2296 + }, + { + "epoch": 0.29, + "grad_norm": 1.540422797203064, + "learning_rate": 8.278293628069502e-06, + "loss": 0.6238, + "step": 2297 + }, + { + "epoch": 0.29, + "grad_norm": 1.1604392528533936, + "learning_rate": 8.276726580485082e-06, + "loss": 0.6685, + "step": 2298 + }, + { + "epoch": 0.29, + "grad_norm": 1.4422894716262817, + "learning_rate": 8.27515896855111e-06, + "loss": 0.6697, + "step": 2299 + }, + { + "epoch": 0.29, + "grad_norm": 1.4221534729003906, + "learning_rate": 8.273590792537574e-06, + "loss": 0.6167, + "step": 2300 + }, + { + "epoch": 0.29, + "grad_norm": 1.291616678237915, + "learning_rate": 8.272022052714563e-06, + "loss": 0.6258, + "step": 2301 + }, + { + "epoch": 0.29, + "grad_norm": 0.9711089134216309, + "learning_rate": 8.270452749352261e-06, + "loss": 0.5395, + "step": 2302 + }, + { + "epoch": 0.3, + "grad_norm": 1.145838975906372, + "learning_rate": 8.268882882720946e-06, + "loss": 0.5787, + "step": 2303 + }, + { + "epoch": 0.3, + "grad_norm": 1.0817339420318604, + "learning_rate": 8.267312453090997e-06, + "loss": 0.6168, + "step": 2304 + }, + { + "epoch": 0.3, + "grad_norm": 1.2782442569732666, + "learning_rate": 8.26574146073289e-06, + "loss": 0.673, + "step": 2305 + }, + { + "epoch": 0.3, + "grad_norm": 1.1187020540237427, + "learning_rate": 8.264169905917194e-06, + "loss": 0.6032, + "step": 2306 + }, + { + "epoch": 0.3, + "grad_norm": 1.3407741785049438, + "learning_rate": 8.262597788914579e-06, + "loss": 0.6716, + "step": 2307 + }, + { + "epoch": 0.3, + "grad_norm": 1.4748417139053345, + "learning_rate": 8.26102510999581e-06, + "loss": 0.6684, + "step": 2308 + }, + { + "epoch": 0.3, + "grad_norm": 1.8912734985351562, + "learning_rate": 8.259451869431746e-06, + "loss": 0.5978, + "step": 2309 + }, + { + "epoch": 0.3, + "grad_norm": 0.9286829233169556, + "learning_rate": 8.257878067493351e-06, + "loss": 0.6408, + "step": 2310 + }, + { + "epoch": 0.3, + "grad_norm": 1.2534960508346558, + "learning_rate": 8.256303704451679e-06, + "loss": 0.5862, + "step": 2311 + }, + { + "epoch": 0.3, + "grad_norm": 1.3470648527145386, + "learning_rate": 8.254728780577877e-06, + "loss": 0.6486, + "step": 2312 + }, + { + "epoch": 0.3, + "grad_norm": 1.1892026662826538, + "learning_rate": 8.253153296143199e-06, + "loss": 0.6312, + "step": 2313 + }, + { + "epoch": 0.3, + "grad_norm": 2.13631534576416, + "learning_rate": 8.251577251418987e-06, + "loss": 0.5599, + "step": 2314 + }, + { + "epoch": 0.3, + "grad_norm": 1.4602019786834717, + "learning_rate": 8.250000646676688e-06, + "loss": 0.638, + "step": 2315 + }, + { + "epoch": 0.3, + "grad_norm": 1.0925565958023071, + "learning_rate": 8.248423482187832e-06, + "loss": 0.6265, + "step": 2316 + }, + { + "epoch": 0.3, + "grad_norm": 1.1063237190246582, + "learning_rate": 8.246845758224062e-06, + "loss": 0.6283, + "step": 2317 + }, + { + "epoch": 0.3, + "grad_norm": 1.2904975414276123, + "learning_rate": 8.245267475057102e-06, + "loss": 0.6167, + "step": 2318 + }, + { + "epoch": 0.3, + "grad_norm": 1.1062041521072388, + "learning_rate": 8.243688632958783e-06, + "loss": 0.6804, + "step": 2319 + }, + { + "epoch": 0.3, + "grad_norm": 1.2432068586349487, + "learning_rate": 8.242109232201028e-06, + "loss": 0.6152, + "step": 2320 + }, + { + "epoch": 0.3, + "grad_norm": 1.199753761291504, + "learning_rate": 8.240529273055852e-06, + "loss": 0.6969, + "step": 2321 + }, + { + "epoch": 0.3, + "grad_norm": 2.155482769012451, + "learning_rate": 8.238948755795379e-06, + "loss": 0.69, + "step": 2322 + }, + { + "epoch": 0.3, + "grad_norm": 1.2352306842803955, + "learning_rate": 8.237367680691817e-06, + "loss": 0.6873, + "step": 2323 + }, + { + "epoch": 0.3, + "grad_norm": 1.3099703788757324, + "learning_rate": 8.235786048017473e-06, + "loss": 0.6747, + "step": 2324 + }, + { + "epoch": 0.3, + "grad_norm": 1.2565714120864868, + "learning_rate": 8.234203858044751e-06, + "loss": 0.6278, + "step": 2325 + }, + { + "epoch": 0.3, + "grad_norm": 1.7045984268188477, + "learning_rate": 8.232621111046154e-06, + "loss": 0.6427, + "step": 2326 + }, + { + "epoch": 0.3, + "grad_norm": 1.1133400201797485, + "learning_rate": 8.231037807294275e-06, + "loss": 0.6679, + "step": 2327 + }, + { + "epoch": 0.3, + "grad_norm": 1.3851282596588135, + "learning_rate": 8.229453947061807e-06, + "loss": 0.6312, + "step": 2328 + }, + { + "epoch": 0.3, + "grad_norm": 1.1454262733459473, + "learning_rate": 8.227869530621538e-06, + "loss": 0.6135, + "step": 2329 + }, + { + "epoch": 0.3, + "grad_norm": 1.3275219202041626, + "learning_rate": 8.226284558246351e-06, + "loss": 0.6552, + "step": 2330 + }, + { + "epoch": 0.3, + "grad_norm": 1.5105386972427368, + "learning_rate": 8.224699030209227e-06, + "loss": 0.6731, + "step": 2331 + }, + { + "epoch": 0.3, + "grad_norm": 1.1528339385986328, + "learning_rate": 8.223112946783237e-06, + "loss": 0.5775, + "step": 2332 + }, + { + "epoch": 0.3, + "grad_norm": 1.2737232446670532, + "learning_rate": 8.221526308241556e-06, + "loss": 0.5996, + "step": 2333 + }, + { + "epoch": 0.3, + "grad_norm": 1.409363031387329, + "learning_rate": 8.219939114857446e-06, + "loss": 0.6846, + "step": 2334 + }, + { + "epoch": 0.3, + "grad_norm": 1.1390594244003296, + "learning_rate": 8.218351366904273e-06, + "loss": 0.6503, + "step": 2335 + }, + { + "epoch": 0.3, + "grad_norm": 1.7003358602523804, + "learning_rate": 8.216763064655493e-06, + "loss": 0.6394, + "step": 2336 + }, + { + "epoch": 0.3, + "grad_norm": 1.6137683391571045, + "learning_rate": 8.215174208384658e-06, + "loss": 0.5789, + "step": 2337 + }, + { + "epoch": 0.3, + "grad_norm": 1.193477988243103, + "learning_rate": 8.213584798365416e-06, + "loss": 0.5051, + "step": 2338 + }, + { + "epoch": 0.3, + "grad_norm": 1.2043871879577637, + "learning_rate": 8.211994834871511e-06, + "loss": 0.5932, + "step": 2339 + }, + { + "epoch": 0.3, + "grad_norm": 1.538806676864624, + "learning_rate": 8.21040431817678e-06, + "loss": 0.6492, + "step": 2340 + }, + { + "epoch": 0.3, + "grad_norm": 1.2634645700454712, + "learning_rate": 8.208813248555163e-06, + "loss": 0.6181, + "step": 2341 + }, + { + "epoch": 0.3, + "grad_norm": 1.2859795093536377, + "learning_rate": 8.207221626280683e-06, + "loss": 0.6666, + "step": 2342 + }, + { + "epoch": 0.3, + "grad_norm": 1.1655632257461548, + "learning_rate": 8.205629451627469e-06, + "loss": 0.6052, + "step": 2343 + }, + { + "epoch": 0.3, + "grad_norm": 6.354539394378662, + "learning_rate": 8.204036724869737e-06, + "loss": 0.6652, + "step": 2344 + }, + { + "epoch": 0.3, + "grad_norm": 1.6453803777694702, + "learning_rate": 8.202443446281804e-06, + "loss": 0.5975, + "step": 2345 + }, + { + "epoch": 0.3, + "grad_norm": 1.1627188920974731, + "learning_rate": 8.20084961613808e-06, + "loss": 0.6252, + "step": 2346 + }, + { + "epoch": 0.3, + "grad_norm": 0.9727002382278442, + "learning_rate": 8.199255234713068e-06, + "loss": 0.5488, + "step": 2347 + }, + { + "epoch": 0.3, + "grad_norm": 1.367080807685852, + "learning_rate": 8.197660302281371e-06, + "loss": 0.59, + "step": 2348 + }, + { + "epoch": 0.3, + "grad_norm": 1.4162267446517944, + "learning_rate": 8.196064819117681e-06, + "loss": 0.6033, + "step": 2349 + }, + { + "epoch": 0.3, + "grad_norm": 1.5463422536849976, + "learning_rate": 8.194468785496788e-06, + "loss": 0.6141, + "step": 2350 + }, + { + "epoch": 0.3, + "grad_norm": 1.220804214477539, + "learning_rate": 8.192872201693575e-06, + "loss": 0.6256, + "step": 2351 + }, + { + "epoch": 0.3, + "grad_norm": 1.263001799583435, + "learning_rate": 8.191275067983026e-06, + "loss": 0.6838, + "step": 2352 + }, + { + "epoch": 0.3, + "grad_norm": 0.9969754815101624, + "learning_rate": 8.189677384640212e-06, + "loss": 0.6114, + "step": 2353 + }, + { + "epoch": 0.3, + "grad_norm": 1.2792723178863525, + "learning_rate": 8.188079151940299e-06, + "loss": 0.6809, + "step": 2354 + }, + { + "epoch": 0.3, + "grad_norm": 1.3698155879974365, + "learning_rate": 8.186480370158552e-06, + "loss": 0.6168, + "step": 2355 + }, + { + "epoch": 0.3, + "grad_norm": 1.3380119800567627, + "learning_rate": 8.18488103957033e-06, + "loss": 0.6018, + "step": 2356 + }, + { + "epoch": 0.3, + "grad_norm": 1.2656079530715942, + "learning_rate": 8.183281160451083e-06, + "loss": 0.5869, + "step": 2357 + }, + { + "epoch": 0.3, + "grad_norm": 1.461011290550232, + "learning_rate": 8.181680733076359e-06, + "loss": 0.5784, + "step": 2358 + }, + { + "epoch": 0.3, + "grad_norm": 1.3184751272201538, + "learning_rate": 8.180079757721799e-06, + "loss": 0.6599, + "step": 2359 + }, + { + "epoch": 0.3, + "grad_norm": 1.169732689857483, + "learning_rate": 8.178478234663139e-06, + "loss": 0.6553, + "step": 2360 + }, + { + "epoch": 0.3, + "grad_norm": 1.2968778610229492, + "learning_rate": 8.176876164176206e-06, + "loss": 0.669, + "step": 2361 + }, + { + "epoch": 0.3, + "grad_norm": 1.166947603225708, + "learning_rate": 8.175273546536929e-06, + "loss": 0.6507, + "step": 2362 + }, + { + "epoch": 0.3, + "grad_norm": 1.2829275131225586, + "learning_rate": 8.17367038202132e-06, + "loss": 0.5774, + "step": 2363 + }, + { + "epoch": 0.3, + "grad_norm": 1.1029894351959229, + "learning_rate": 8.172066670905498e-06, + "loss": 0.6448, + "step": 2364 + }, + { + "epoch": 0.3, + "grad_norm": 1.3434944152832031, + "learning_rate": 8.170462413465666e-06, + "loss": 0.6137, + "step": 2365 + }, + { + "epoch": 0.3, + "grad_norm": 1.5390658378601074, + "learning_rate": 8.168857609978125e-06, + "loss": 0.647, + "step": 2366 + }, + { + "epoch": 0.3, + "grad_norm": 1.3299118280410767, + "learning_rate": 8.16725226071927e-06, + "loss": 0.6634, + "step": 2367 + }, + { + "epoch": 0.3, + "grad_norm": 1.6107890605926514, + "learning_rate": 8.165646365965589e-06, + "loss": 0.5927, + "step": 2368 + }, + { + "epoch": 0.3, + "grad_norm": 1.2683249711990356, + "learning_rate": 8.164039925993667e-06, + "loss": 0.6726, + "step": 2369 + }, + { + "epoch": 0.3, + "grad_norm": 1.6438405513763428, + "learning_rate": 8.162432941080178e-06, + "loss": 0.619, + "step": 2370 + }, + { + "epoch": 0.3, + "grad_norm": 1.1879276037216187, + "learning_rate": 8.160825411501896e-06, + "loss": 0.5904, + "step": 2371 + }, + { + "epoch": 0.3, + "grad_norm": 1.1308679580688477, + "learning_rate": 8.159217337535682e-06, + "loss": 0.6671, + "step": 2372 + }, + { + "epoch": 0.3, + "grad_norm": 1.3544384241104126, + "learning_rate": 8.157608719458493e-06, + "loss": 0.5496, + "step": 2373 + }, + { + "epoch": 0.3, + "grad_norm": 6.0907440185546875, + "learning_rate": 8.155999557547384e-06, + "loss": 0.6856, + "step": 2374 + }, + { + "epoch": 0.3, + "grad_norm": 1.3868075609207153, + "learning_rate": 8.154389852079501e-06, + "loss": 0.6667, + "step": 2375 + }, + { + "epoch": 0.3, + "grad_norm": 1.0029851198196411, + "learning_rate": 8.15277960333208e-06, + "loss": 0.6324, + "step": 2376 + }, + { + "epoch": 0.3, + "grad_norm": 1.873964548110962, + "learning_rate": 8.151168811582455e-06, + "loss": 0.6404, + "step": 2377 + }, + { + "epoch": 0.3, + "grad_norm": 1.0714986324310303, + "learning_rate": 8.149557477108051e-06, + "loss": 0.6066, + "step": 2378 + }, + { + "epoch": 0.3, + "grad_norm": 1.5956631898880005, + "learning_rate": 8.147945600186391e-06, + "loss": 0.6414, + "step": 2379 + }, + { + "epoch": 0.3, + "grad_norm": 1.2539238929748535, + "learning_rate": 8.146333181095086e-06, + "loss": 0.5592, + "step": 2380 + }, + { + "epoch": 0.31, + "grad_norm": 1.2881016731262207, + "learning_rate": 8.14472022011184e-06, + "loss": 0.5886, + "step": 2381 + }, + { + "epoch": 0.31, + "grad_norm": 3.226897954940796, + "learning_rate": 8.143106717514455e-06, + "loss": 0.7155, + "step": 2382 + }, + { + "epoch": 0.31, + "grad_norm": 1.4589011669158936, + "learning_rate": 8.141492673580825e-06, + "loss": 0.7016, + "step": 2383 + }, + { + "epoch": 0.31, + "grad_norm": 1.2750200033187866, + "learning_rate": 8.139878088588934e-06, + "loss": 0.6133, + "step": 2384 + }, + { + "epoch": 0.31, + "grad_norm": 1.2676771879196167, + "learning_rate": 8.138262962816865e-06, + "loss": 0.6297, + "step": 2385 + }, + { + "epoch": 0.31, + "grad_norm": 1.572501301765442, + "learning_rate": 8.136647296542787e-06, + "loss": 0.6115, + "step": 2386 + }, + { + "epoch": 0.31, + "grad_norm": 1.544345736503601, + "learning_rate": 8.135031090044966e-06, + "loss": 0.6698, + "step": 2387 + }, + { + "epoch": 0.31, + "grad_norm": 1.247854232788086, + "learning_rate": 8.133414343601762e-06, + "loss": 0.6356, + "step": 2388 + }, + { + "epoch": 0.31, + "grad_norm": 1.2449547052383423, + "learning_rate": 8.131797057491627e-06, + "loss": 0.6794, + "step": 2389 + }, + { + "epoch": 0.31, + "grad_norm": 1.5108448266983032, + "learning_rate": 8.130179231993105e-06, + "loss": 0.6564, + "step": 2390 + }, + { + "epoch": 0.31, + "grad_norm": 1.2895339727401733, + "learning_rate": 8.128560867384832e-06, + "loss": 0.5596, + "step": 2391 + }, + { + "epoch": 0.31, + "grad_norm": 1.3409855365753174, + "learning_rate": 8.126941963945541e-06, + "loss": 0.6147, + "step": 2392 + }, + { + "epoch": 0.31, + "grad_norm": 1.2308881282806396, + "learning_rate": 8.125322521954055e-06, + "loss": 0.5847, + "step": 2393 + }, + { + "epoch": 0.31, + "grad_norm": 1.4125090837478638, + "learning_rate": 8.12370254168929e-06, + "loss": 0.6779, + "step": 2394 + }, + { + "epoch": 0.31, + "grad_norm": 1.1895586252212524, + "learning_rate": 8.122082023430251e-06, + "loss": 0.7764, + "step": 2395 + }, + { + "epoch": 0.31, + "grad_norm": 1.083396077156067, + "learning_rate": 8.120460967456043e-06, + "loss": 0.6141, + "step": 2396 + }, + { + "epoch": 0.31, + "grad_norm": 1.1598103046417236, + "learning_rate": 8.118839374045861e-06, + "loss": 0.612, + "step": 2397 + }, + { + "epoch": 0.31, + "grad_norm": 1.1906448602676392, + "learning_rate": 8.117217243478988e-06, + "loss": 0.6277, + "step": 2398 + }, + { + "epoch": 0.31, + "grad_norm": 1.1712722778320312, + "learning_rate": 8.115594576034804e-06, + "loss": 0.7004, + "step": 2399 + }, + { + "epoch": 0.31, + "grad_norm": 1.3918662071228027, + "learning_rate": 8.113971371992782e-06, + "loss": 0.5189, + "step": 2400 + }, + { + "epoch": 0.31, + "grad_norm": 1.191245436668396, + "learning_rate": 8.112347631632484e-06, + "loss": 0.6203, + "step": 2401 + }, + { + "epoch": 0.31, + "grad_norm": 1.500771403312683, + "learning_rate": 8.11072335523357e-06, + "loss": 0.5052, + "step": 2402 + }, + { + "epoch": 0.31, + "grad_norm": 1.3032631874084473, + "learning_rate": 8.109098543075784e-06, + "loss": 0.6056, + "step": 2403 + }, + { + "epoch": 0.31, + "grad_norm": 1.1832846403121948, + "learning_rate": 8.107473195438969e-06, + "loss": 0.6169, + "step": 2404 + }, + { + "epoch": 0.31, + "grad_norm": 1.2408926486968994, + "learning_rate": 8.105847312603057e-06, + "loss": 0.5983, + "step": 2405 + }, + { + "epoch": 0.31, + "grad_norm": 1.0566129684448242, + "learning_rate": 8.104220894848073e-06, + "loss": 0.6529, + "step": 2406 + }, + { + "epoch": 0.31, + "grad_norm": 1.521499752998352, + "learning_rate": 8.102593942454138e-06, + "loss": 0.6247, + "step": 2407 + }, + { + "epoch": 0.31, + "grad_norm": 1.2609105110168457, + "learning_rate": 8.100966455701458e-06, + "loss": 0.551, + "step": 2408 + }, + { + "epoch": 0.31, + "grad_norm": 1.3678559064865112, + "learning_rate": 8.099338434870336e-06, + "loss": 0.6309, + "step": 2409 + }, + { + "epoch": 0.31, + "grad_norm": 1.0221734046936035, + "learning_rate": 8.097709880241165e-06, + "loss": 0.5807, + "step": 2410 + }, + { + "epoch": 0.31, + "grad_norm": 1.6654701232910156, + "learning_rate": 8.09608079209443e-06, + "loss": 0.6289, + "step": 2411 + }, + { + "epoch": 0.31, + "grad_norm": 1.2061890363693237, + "learning_rate": 8.094451170710708e-06, + "loss": 0.6315, + "step": 2412 + }, + { + "epoch": 0.31, + "grad_norm": 1.0013035535812378, + "learning_rate": 8.09282101637067e-06, + "loss": 0.5325, + "step": 2413 + }, + { + "epoch": 0.31, + "grad_norm": 3.319288730621338, + "learning_rate": 8.091190329355076e-06, + "loss": 0.6432, + "step": 2414 + }, + { + "epoch": 0.31, + "grad_norm": 1.1914440393447876, + "learning_rate": 8.089559109944777e-06, + "loss": 0.5529, + "step": 2415 + }, + { + "epoch": 0.31, + "grad_norm": 1.162363886833191, + "learning_rate": 8.087927358420723e-06, + "loss": 0.6628, + "step": 2416 + }, + { + "epoch": 0.31, + "grad_norm": 1.818101406097412, + "learning_rate": 8.086295075063942e-06, + "loss": 0.6276, + "step": 2417 + }, + { + "epoch": 0.31, + "grad_norm": 1.2434170246124268, + "learning_rate": 8.084662260155567e-06, + "loss": 0.6482, + "step": 2418 + }, + { + "epoch": 0.31, + "grad_norm": 0.9897717833518982, + "learning_rate": 8.083028913976816e-06, + "loss": 0.6063, + "step": 2419 + }, + { + "epoch": 0.31, + "grad_norm": 1.3943613767623901, + "learning_rate": 8.081395036808999e-06, + "loss": 0.6482, + "step": 2420 + }, + { + "epoch": 0.31, + "grad_norm": 1.2112592458724976, + "learning_rate": 8.079760628933518e-06, + "loss": 0.6589, + "step": 2421 + }, + { + "epoch": 0.31, + "grad_norm": 1.2801096439361572, + "learning_rate": 8.078125690631868e-06, + "loss": 0.5905, + "step": 2422 + }, + { + "epoch": 0.31, + "grad_norm": 1.4379853010177612, + "learning_rate": 8.076490222185631e-06, + "loss": 0.5817, + "step": 2423 + }, + { + "epoch": 0.31, + "grad_norm": 1.9312883615493774, + "learning_rate": 8.074854223876487e-06, + "loss": 0.6578, + "step": 2424 + }, + { + "epoch": 0.31, + "grad_norm": 3.0586395263671875, + "learning_rate": 8.073217695986203e-06, + "loss": 0.5962, + "step": 2425 + }, + { + "epoch": 0.31, + "grad_norm": 2.1898674964904785, + "learning_rate": 8.071580638796634e-06, + "loss": 0.6595, + "step": 2426 + }, + { + "epoch": 0.31, + "grad_norm": 1.2980766296386719, + "learning_rate": 8.069943052589734e-06, + "loss": 0.6066, + "step": 2427 + }, + { + "epoch": 0.31, + "grad_norm": 1.2575178146362305, + "learning_rate": 8.068304937647542e-06, + "loss": 0.5541, + "step": 2428 + }, + { + "epoch": 0.31, + "grad_norm": 1.1975334882736206, + "learning_rate": 8.066666294252189e-06, + "loss": 0.6744, + "step": 2429 + }, + { + "epoch": 0.31, + "grad_norm": 1.5488165616989136, + "learning_rate": 8.0650271226859e-06, + "loss": 0.6803, + "step": 2430 + }, + { + "epoch": 0.31, + "grad_norm": 1.2515982389450073, + "learning_rate": 8.063387423230987e-06, + "loss": 0.6455, + "step": 2431 + }, + { + "epoch": 0.31, + "grad_norm": 1.3792338371276855, + "learning_rate": 8.061747196169855e-06, + "loss": 0.5893, + "step": 2432 + }, + { + "epoch": 0.31, + "grad_norm": 1.6035951375961304, + "learning_rate": 8.060106441785003e-06, + "loss": 0.5758, + "step": 2433 + }, + { + "epoch": 0.31, + "grad_norm": 2.494800090789795, + "learning_rate": 8.058465160359016e-06, + "loss": 0.6529, + "step": 2434 + }, + { + "epoch": 0.31, + "grad_norm": 1.2441433668136597, + "learning_rate": 8.05682335217457e-06, + "loss": 0.5876, + "step": 2435 + }, + { + "epoch": 0.31, + "grad_norm": 1.5442471504211426, + "learning_rate": 8.055181017514432e-06, + "loss": 0.6346, + "step": 2436 + }, + { + "epoch": 0.31, + "grad_norm": 1.3310129642486572, + "learning_rate": 8.053538156661465e-06, + "loss": 0.6639, + "step": 2437 + }, + { + "epoch": 0.31, + "grad_norm": 1.222756028175354, + "learning_rate": 8.051894769898615e-06, + "loss": 0.6473, + "step": 2438 + }, + { + "epoch": 0.31, + "grad_norm": 1.3063236474990845, + "learning_rate": 8.050250857508923e-06, + "loss": 0.7353, + "step": 2439 + }, + { + "epoch": 0.31, + "grad_norm": 1.1706254482269287, + "learning_rate": 8.04860641977552e-06, + "loss": 0.6611, + "step": 2440 + }, + { + "epoch": 0.31, + "grad_norm": 1.5362704992294312, + "learning_rate": 8.046961456981625e-06, + "loss": 0.6244, + "step": 2441 + }, + { + "epoch": 0.31, + "grad_norm": 1.1532371044158936, + "learning_rate": 8.045315969410551e-06, + "loss": 0.6642, + "step": 2442 + }, + { + "epoch": 0.31, + "grad_norm": 1.4653925895690918, + "learning_rate": 8.043669957345701e-06, + "loss": 0.6008, + "step": 2443 + }, + { + "epoch": 0.31, + "grad_norm": 2.7584710121154785, + "learning_rate": 8.042023421070566e-06, + "loss": 0.6796, + "step": 2444 + }, + { + "epoch": 0.31, + "grad_norm": 1.38533616065979, + "learning_rate": 8.040376360868727e-06, + "loss": 0.7932, + "step": 2445 + }, + { + "epoch": 0.31, + "grad_norm": 2.1703217029571533, + "learning_rate": 8.038728777023858e-06, + "loss": 0.5941, + "step": 2446 + }, + { + "epoch": 0.31, + "grad_norm": 4.146225452423096, + "learning_rate": 8.037080669819723e-06, + "loss": 0.6693, + "step": 2447 + }, + { + "epoch": 0.31, + "grad_norm": 1.3640509843826294, + "learning_rate": 8.035432039540172e-06, + "loss": 0.7157, + "step": 2448 + }, + { + "epoch": 0.31, + "grad_norm": 1.1412841081619263, + "learning_rate": 8.03378288646915e-06, + "loss": 0.4853, + "step": 2449 + }, + { + "epoch": 0.31, + "grad_norm": 1.9124466180801392, + "learning_rate": 8.03213321089069e-06, + "loss": 0.5493, + "step": 2450 + }, + { + "epoch": 0.31, + "grad_norm": 1.4567307233810425, + "learning_rate": 8.030483013088913e-06, + "loss": 0.6635, + "step": 2451 + }, + { + "epoch": 0.31, + "grad_norm": 1.292675495147705, + "learning_rate": 8.028832293348036e-06, + "loss": 0.7629, + "step": 2452 + }, + { + "epoch": 0.31, + "grad_norm": 1.2411820888519287, + "learning_rate": 8.027181051952359e-06, + "loss": 0.6375, + "step": 2453 + }, + { + "epoch": 0.31, + "grad_norm": 1.1143420934677124, + "learning_rate": 8.025529289186276e-06, + "loss": 0.5945, + "step": 2454 + }, + { + "epoch": 0.31, + "grad_norm": 1.4121794700622559, + "learning_rate": 8.023877005334268e-06, + "loss": 0.5983, + "step": 2455 + }, + { + "epoch": 0.31, + "grad_norm": 1.5333738327026367, + "learning_rate": 8.022224200680911e-06, + "loss": 0.5918, + "step": 2456 + }, + { + "epoch": 0.31, + "grad_norm": 1.6830614805221558, + "learning_rate": 8.020570875510863e-06, + "loss": 0.5903, + "step": 2457 + }, + { + "epoch": 0.31, + "grad_norm": 1.3144832849502563, + "learning_rate": 8.018917030108874e-06, + "loss": 0.665, + "step": 2458 + }, + { + "epoch": 0.32, + "grad_norm": 3.2713377475738525, + "learning_rate": 8.017262664759793e-06, + "loss": 0.6369, + "step": 2459 + }, + { + "epoch": 0.32, + "grad_norm": 1.5499030351638794, + "learning_rate": 8.015607779748544e-06, + "loss": 0.5759, + "step": 2460 + }, + { + "epoch": 0.32, + "grad_norm": 1.8569972515106201, + "learning_rate": 8.01395237536015e-06, + "loss": 0.6703, + "step": 2461 + }, + { + "epoch": 0.32, + "grad_norm": 1.4563158750534058, + "learning_rate": 8.01229645187972e-06, + "loss": 0.6818, + "step": 2462 + }, + { + "epoch": 0.32, + "grad_norm": 1.1788994073867798, + "learning_rate": 8.010640009592454e-06, + "loss": 0.6173, + "step": 2463 + }, + { + "epoch": 0.32, + "grad_norm": 1.48537015914917, + "learning_rate": 8.008983048783639e-06, + "loss": 0.5923, + "step": 2464 + }, + { + "epoch": 0.32, + "grad_norm": 1.9768660068511963, + "learning_rate": 8.007325569738654e-06, + "loss": 0.6231, + "step": 2465 + }, + { + "epoch": 0.32, + "grad_norm": 1.359007716178894, + "learning_rate": 8.005667572742964e-06, + "loss": 0.6858, + "step": 2466 + }, + { + "epoch": 0.32, + "grad_norm": 1.6291216611862183, + "learning_rate": 8.00400905808213e-06, + "loss": 0.6691, + "step": 2467 + }, + { + "epoch": 0.32, + "grad_norm": 1.2007826566696167, + "learning_rate": 8.002350026041792e-06, + "loss": 0.6349, + "step": 2468 + }, + { + "epoch": 0.32, + "grad_norm": 1.9453998804092407, + "learning_rate": 8.000690476907688e-06, + "loss": 0.5912, + "step": 2469 + }, + { + "epoch": 0.32, + "grad_norm": 1.2745121717453003, + "learning_rate": 7.999030410965642e-06, + "loss": 0.6899, + "step": 2470 + }, + { + "epoch": 0.32, + "grad_norm": 1.5417898893356323, + "learning_rate": 7.997369828501565e-06, + "loss": 0.5747, + "step": 2471 + }, + { + "epoch": 0.32, + "grad_norm": 1.322919249534607, + "learning_rate": 7.995708729801459e-06, + "loss": 0.5912, + "step": 2472 + }, + { + "epoch": 0.32, + "grad_norm": 2.1955883502960205, + "learning_rate": 7.994047115151414e-06, + "loss": 0.5515, + "step": 2473 + }, + { + "epoch": 0.32, + "grad_norm": 1.2016857862472534, + "learning_rate": 7.992384984837608e-06, + "loss": 0.5988, + "step": 2474 + }, + { + "epoch": 0.32, + "grad_norm": 1.1391526460647583, + "learning_rate": 7.99072233914631e-06, + "loss": 0.7175, + "step": 2475 + }, + { + "epoch": 0.32, + "grad_norm": 1.383262276649475, + "learning_rate": 7.98905917836388e-06, + "loss": 0.5499, + "step": 2476 + }, + { + "epoch": 0.32, + "grad_norm": 1.3291974067687988, + "learning_rate": 7.987395502776762e-06, + "loss": 0.5684, + "step": 2477 + }, + { + "epoch": 0.32, + "grad_norm": 1.3303680419921875, + "learning_rate": 7.98573131267149e-06, + "loss": 0.6121, + "step": 2478 + }, + { + "epoch": 0.32, + "grad_norm": 1.3416320085525513, + "learning_rate": 7.984066608334684e-06, + "loss": 0.58, + "step": 2479 + }, + { + "epoch": 0.32, + "grad_norm": 1.2517801523208618, + "learning_rate": 7.982401390053061e-06, + "loss": 0.6141, + "step": 2480 + }, + { + "epoch": 0.32, + "grad_norm": 1.685667634010315, + "learning_rate": 7.980735658113416e-06, + "loss": 0.5259, + "step": 2481 + }, + { + "epoch": 0.32, + "grad_norm": 1.3155723810195923, + "learning_rate": 7.97906941280264e-06, + "loss": 0.5923, + "step": 2482 + }, + { + "epoch": 0.32, + "grad_norm": 1.6275346279144287, + "learning_rate": 7.97740265440771e-06, + "loss": 0.6981, + "step": 2483 + }, + { + "epoch": 0.32, + "grad_norm": 1.539910078048706, + "learning_rate": 7.975735383215691e-06, + "loss": 0.7108, + "step": 2484 + }, + { + "epoch": 0.32, + "grad_norm": 1.0558339357376099, + "learning_rate": 7.974067599513737e-06, + "loss": 0.6255, + "step": 2485 + }, + { + "epoch": 0.32, + "grad_norm": 1.4272794723510742, + "learning_rate": 7.972399303589087e-06, + "loss": 0.646, + "step": 2486 + }, + { + "epoch": 0.32, + "grad_norm": 1.5255314111709595, + "learning_rate": 7.970730495729075e-06, + "loss": 0.6481, + "step": 2487 + }, + { + "epoch": 0.32, + "grad_norm": 1.7075724601745605, + "learning_rate": 7.969061176221118e-06, + "loss": 0.6343, + "step": 2488 + }, + { + "epoch": 0.32, + "grad_norm": 1.2317755222320557, + "learning_rate": 7.96739134535272e-06, + "loss": 0.7467, + "step": 2489 + }, + { + "epoch": 0.32, + "grad_norm": 1.4913558959960938, + "learning_rate": 7.965721003411477e-06, + "loss": 0.6367, + "step": 2490 + }, + { + "epoch": 0.32, + "grad_norm": 1.5637904405593872, + "learning_rate": 7.964050150685075e-06, + "loss": 0.6279, + "step": 2491 + }, + { + "epoch": 0.32, + "grad_norm": 1.2378253936767578, + "learning_rate": 7.962378787461278e-06, + "loss": 0.6074, + "step": 2492 + }, + { + "epoch": 0.32, + "grad_norm": 1.2458785772323608, + "learning_rate": 7.960706914027947e-06, + "loss": 0.6126, + "step": 2493 + }, + { + "epoch": 0.32, + "grad_norm": 1.1778990030288696, + "learning_rate": 7.95903453067303e-06, + "loss": 0.6499, + "step": 2494 + }, + { + "epoch": 0.32, + "grad_norm": 1.3699263334274292, + "learning_rate": 7.95736163768456e-06, + "loss": 0.6635, + "step": 2495 + }, + { + "epoch": 0.32, + "grad_norm": 1.1903271675109863, + "learning_rate": 7.955688235350659e-06, + "loss": 0.6467, + "step": 2496 + }, + { + "epoch": 0.32, + "grad_norm": 0.9940861463546753, + "learning_rate": 7.954014323959535e-06, + "loss": 0.5876, + "step": 2497 + }, + { + "epoch": 0.32, + "grad_norm": 1.2723509073257446, + "learning_rate": 7.952339903799486e-06, + "loss": 0.6072, + "step": 2498 + }, + { + "epoch": 0.32, + "grad_norm": 1.031148076057434, + "learning_rate": 7.950664975158898e-06, + "loss": 0.6026, + "step": 2499 + }, + { + "epoch": 0.32, + "grad_norm": 1.7933772802352905, + "learning_rate": 7.948989538326241e-06, + "loss": 0.7027, + "step": 2500 + }, + { + "epoch": 0.32, + "grad_norm": 1.2486510276794434, + "learning_rate": 7.947313593590078e-06, + "loss": 0.5948, + "step": 2501 + }, + { + "epoch": 0.32, + "grad_norm": 1.5051732063293457, + "learning_rate": 7.945637141239054e-06, + "loss": 0.6531, + "step": 2502 + }, + { + "epoch": 0.32, + "grad_norm": 1.544805884361267, + "learning_rate": 7.943960181561905e-06, + "loss": 0.6543, + "step": 2503 + }, + { + "epoch": 0.32, + "grad_norm": 1.2853175401687622, + "learning_rate": 7.942282714847453e-06, + "loss": 0.6587, + "step": 2504 + }, + { + "epoch": 0.32, + "grad_norm": 1.3751949071884155, + "learning_rate": 7.940604741384607e-06, + "loss": 0.5903, + "step": 2505 + }, + { + "epoch": 0.32, + "grad_norm": 1.3573917150497437, + "learning_rate": 7.938926261462366e-06, + "loss": 0.6358, + "step": 2506 + }, + { + "epoch": 0.32, + "grad_norm": 0.9741413593292236, + "learning_rate": 7.937247275369813e-06, + "loss": 0.61, + "step": 2507 + }, + { + "epoch": 0.32, + "grad_norm": 1.3532620668411255, + "learning_rate": 7.935567783396116e-06, + "loss": 0.5756, + "step": 2508 + }, + { + "epoch": 0.32, + "grad_norm": 1.2103369235992432, + "learning_rate": 7.933887785830536e-06, + "loss": 0.6417, + "step": 2509 + }, + { + "epoch": 0.32, + "grad_norm": 1.3352247476577759, + "learning_rate": 7.93220728296242e-06, + "loss": 0.6468, + "step": 2510 + }, + { + "epoch": 0.32, + "grad_norm": 1.1334822177886963, + "learning_rate": 7.930526275081198e-06, + "loss": 0.629, + "step": 2511 + }, + { + "epoch": 0.32, + "grad_norm": 1.796984314918518, + "learning_rate": 7.92884476247639e-06, + "loss": 0.637, + "step": 2512 + }, + { + "epoch": 0.32, + "grad_norm": 1.3944237232208252, + "learning_rate": 7.927162745437605e-06, + "loss": 0.602, + "step": 2513 + }, + { + "epoch": 0.32, + "grad_norm": 1.1634482145309448, + "learning_rate": 7.92548022425453e-06, + "loss": 0.7354, + "step": 2514 + }, + { + "epoch": 0.32, + "grad_norm": 1.0452791452407837, + "learning_rate": 7.923797199216952e-06, + "loss": 0.5221, + "step": 2515 + }, + { + "epoch": 0.32, + "grad_norm": 1.141053557395935, + "learning_rate": 7.922113670614733e-06, + "loss": 0.6103, + "step": 2516 + }, + { + "epoch": 0.32, + "grad_norm": 1.371309518814087, + "learning_rate": 7.92042963873783e-06, + "loss": 0.5625, + "step": 2517 + }, + { + "epoch": 0.32, + "grad_norm": 1.4770622253417969, + "learning_rate": 7.91874510387628e-06, + "loss": 0.6978, + "step": 2518 + }, + { + "epoch": 0.32, + "grad_norm": 1.2017892599105835, + "learning_rate": 7.917060066320213e-06, + "loss": 0.5845, + "step": 2519 + }, + { + "epoch": 0.32, + "grad_norm": 1.2020307779312134, + "learning_rate": 7.91537452635984e-06, + "loss": 0.5964, + "step": 2520 + }, + { + "epoch": 0.32, + "grad_norm": 1.0251448154449463, + "learning_rate": 7.913688484285462e-06, + "loss": 0.6008, + "step": 2521 + }, + { + "epoch": 0.32, + "grad_norm": 1.8153685331344604, + "learning_rate": 7.912001940387466e-06, + "loss": 0.736, + "step": 2522 + }, + { + "epoch": 0.32, + "grad_norm": 2.3994553089141846, + "learning_rate": 7.910314894956326e-06, + "loss": 0.6798, + "step": 2523 + }, + { + "epoch": 0.32, + "grad_norm": 1.541852593421936, + "learning_rate": 7.908627348282599e-06, + "loss": 0.6723, + "step": 2524 + }, + { + "epoch": 0.32, + "grad_norm": 1.0899004936218262, + "learning_rate": 7.906939300656929e-06, + "loss": 0.6825, + "step": 2525 + }, + { + "epoch": 0.32, + "grad_norm": 1.5158523321151733, + "learning_rate": 7.905250752370051e-06, + "loss": 0.5933, + "step": 2526 + }, + { + "epoch": 0.32, + "grad_norm": 1.0821444988250732, + "learning_rate": 7.903561703712784e-06, + "loss": 0.5961, + "step": 2527 + }, + { + "epoch": 0.32, + "grad_norm": 1.3640822172164917, + "learning_rate": 7.90187215497603e-06, + "loss": 0.6481, + "step": 2528 + }, + { + "epoch": 0.32, + "grad_norm": 1.0577113628387451, + "learning_rate": 7.900182106450778e-06, + "loss": 0.6615, + "step": 2529 + }, + { + "epoch": 0.32, + "grad_norm": 1.1909829378128052, + "learning_rate": 7.898491558428108e-06, + "loss": 0.6597, + "step": 2530 + }, + { + "epoch": 0.32, + "grad_norm": 1.1416188478469849, + "learning_rate": 7.896800511199182e-06, + "loss": 0.7269, + "step": 2531 + }, + { + "epoch": 0.32, + "grad_norm": 1.210830807685852, + "learning_rate": 7.895108965055247e-06, + "loss": 0.6678, + "step": 2532 + }, + { + "epoch": 0.32, + "grad_norm": 0.9497708678245544, + "learning_rate": 7.893416920287638e-06, + "loss": 0.5491, + "step": 2533 + }, + { + "epoch": 0.32, + "grad_norm": 1.2646074295043945, + "learning_rate": 7.891724377187774e-06, + "loss": 0.5835, + "step": 2534 + }, + { + "epoch": 0.32, + "grad_norm": 1.4362199306488037, + "learning_rate": 7.890031336047166e-06, + "loss": 0.6361, + "step": 2535 + }, + { + "epoch": 0.32, + "grad_norm": 1.1478391885757446, + "learning_rate": 7.8883377971574e-06, + "loss": 0.6115, + "step": 2536 + }, + { + "epoch": 0.33, + "grad_norm": 1.1613198518753052, + "learning_rate": 7.886643760810156e-06, + "loss": 0.6171, + "step": 2537 + }, + { + "epoch": 0.33, + "grad_norm": 1.247196912765503, + "learning_rate": 7.884949227297199e-06, + "loss": 0.6298, + "step": 2538 + }, + { + "epoch": 0.33, + "grad_norm": 1.3772872686386108, + "learning_rate": 7.883254196910375e-06, + "loss": 0.5992, + "step": 2539 + }, + { + "epoch": 0.33, + "grad_norm": 1.6356278657913208, + "learning_rate": 7.88155866994162e-06, + "loss": 0.6667, + "step": 2540 + }, + { + "epoch": 0.33, + "grad_norm": 1.2067662477493286, + "learning_rate": 7.879862646682955e-06, + "loss": 0.6492, + "step": 2541 + }, + { + "epoch": 0.33, + "grad_norm": 1.1001278162002563, + "learning_rate": 7.878166127426483e-06, + "loss": 0.6088, + "step": 2542 + }, + { + "epoch": 0.33, + "grad_norm": 0.9857989549636841, + "learning_rate": 7.876469112464395e-06, + "loss": 0.6095, + "step": 2543 + }, + { + "epoch": 0.33, + "grad_norm": 1.3550429344177246, + "learning_rate": 7.874771602088971e-06, + "loss": 0.6157, + "step": 2544 + }, + { + "epoch": 0.33, + "grad_norm": 0.9245561361312866, + "learning_rate": 7.873073596592571e-06, + "loss": 0.5456, + "step": 2545 + }, + { + "epoch": 0.33, + "grad_norm": 1.329111099243164, + "learning_rate": 7.871375096267641e-06, + "loss": 0.6259, + "step": 2546 + }, + { + "epoch": 0.33, + "grad_norm": 1.0523360967636108, + "learning_rate": 7.869676101406713e-06, + "loss": 0.6267, + "step": 2547 + }, + { + "epoch": 0.33, + "grad_norm": 1.33540940284729, + "learning_rate": 7.867976612302405e-06, + "loss": 0.6859, + "step": 2548 + }, + { + "epoch": 0.33, + "grad_norm": 1.6070891618728638, + "learning_rate": 7.86627662924742e-06, + "loss": 0.6134, + "step": 2549 + }, + { + "epoch": 0.33, + "grad_norm": 1.088706374168396, + "learning_rate": 7.864576152534544e-06, + "loss": 0.6603, + "step": 2550 + }, + { + "epoch": 0.33, + "grad_norm": 1.3226513862609863, + "learning_rate": 7.862875182456652e-06, + "loss": 0.5793, + "step": 2551 + }, + { + "epoch": 0.33, + "grad_norm": 1.0518193244934082, + "learning_rate": 7.861173719306697e-06, + "loss": 0.5619, + "step": 2552 + }, + { + "epoch": 0.33, + "grad_norm": 1.0934220552444458, + "learning_rate": 7.859471763377726e-06, + "loss": 0.6339, + "step": 2553 + }, + { + "epoch": 0.33, + "grad_norm": 1.7915741205215454, + "learning_rate": 7.857769314962865e-06, + "loss": 0.6733, + "step": 2554 + }, + { + "epoch": 0.33, + "grad_norm": 1.3046362400054932, + "learning_rate": 7.856066374355326e-06, + "loss": 0.6231, + "step": 2555 + }, + { + "epoch": 0.33, + "grad_norm": 1.4739090204238892, + "learning_rate": 7.854362941848406e-06, + "loss": 0.6874, + "step": 2556 + }, + { + "epoch": 0.33, + "grad_norm": 1.2384624481201172, + "learning_rate": 7.852659017735484e-06, + "loss": 0.642, + "step": 2557 + }, + { + "epoch": 0.33, + "grad_norm": 0.99631267786026, + "learning_rate": 7.850954602310032e-06, + "loss": 0.5691, + "step": 2558 + }, + { + "epoch": 0.33, + "grad_norm": 1.1139394044876099, + "learning_rate": 7.849249695865595e-06, + "loss": 0.6083, + "step": 2559 + }, + { + "epoch": 0.33, + "grad_norm": 1.2851378917694092, + "learning_rate": 7.847544298695812e-06, + "loss": 0.7062, + "step": 2560 + }, + { + "epoch": 0.33, + "grad_norm": 1.2008907794952393, + "learning_rate": 7.845838411094403e-06, + "loss": 0.5856, + "step": 2561 + }, + { + "epoch": 0.33, + "grad_norm": 1.2814170122146606, + "learning_rate": 7.844132033355169e-06, + "loss": 0.6046, + "step": 2562 + }, + { + "epoch": 0.33, + "grad_norm": 1.5419236421585083, + "learning_rate": 7.842425165772003e-06, + "loss": 0.6394, + "step": 2563 + }, + { + "epoch": 0.33, + "grad_norm": 1.3178516626358032, + "learning_rate": 7.840717808638878e-06, + "loss": 0.5834, + "step": 2564 + }, + { + "epoch": 0.33, + "grad_norm": 1.137864112854004, + "learning_rate": 7.839009962249849e-06, + "loss": 0.7019, + "step": 2565 + }, + { + "epoch": 0.33, + "grad_norm": 1.5136849880218506, + "learning_rate": 7.837301626899059e-06, + "loss": 0.6362, + "step": 2566 + }, + { + "epoch": 0.33, + "grad_norm": 1.5012842416763306, + "learning_rate": 7.835592802880733e-06, + "loss": 0.6163, + "step": 2567 + }, + { + "epoch": 0.33, + "grad_norm": 1.2922011613845825, + "learning_rate": 7.833883490489183e-06, + "loss": 0.6593, + "step": 2568 + }, + { + "epoch": 0.33, + "grad_norm": 1.0815269947052002, + "learning_rate": 7.832173690018803e-06, + "loss": 0.641, + "step": 2569 + }, + { + "epoch": 0.33, + "grad_norm": 1.347912073135376, + "learning_rate": 7.83046340176407e-06, + "loss": 0.5536, + "step": 2570 + }, + { + "epoch": 0.33, + "grad_norm": 1.4981647729873657, + "learning_rate": 7.82875262601955e-06, + "loss": 0.7163, + "step": 2571 + }, + { + "epoch": 0.33, + "grad_norm": 1.0309983491897583, + "learning_rate": 7.827041363079884e-06, + "loss": 0.6477, + "step": 2572 + }, + { + "epoch": 0.33, + "grad_norm": 1.9935951232910156, + "learning_rate": 7.825329613239804e-06, + "loss": 0.663, + "step": 2573 + }, + { + "epoch": 0.33, + "grad_norm": 1.2904274463653564, + "learning_rate": 7.823617376794128e-06, + "loss": 0.5916, + "step": 2574 + }, + { + "epoch": 0.33, + "grad_norm": 1.2341314554214478, + "learning_rate": 7.82190465403775e-06, + "loss": 0.6036, + "step": 2575 + }, + { + "epoch": 0.33, + "grad_norm": 1.3546459674835205, + "learning_rate": 7.820191445265653e-06, + "loss": 0.6393, + "step": 2576 + }, + { + "epoch": 0.33, + "grad_norm": 1.1507740020751953, + "learning_rate": 7.818477750772901e-06, + "loss": 0.6983, + "step": 2577 + }, + { + "epoch": 0.33, + "grad_norm": 3.3480865955352783, + "learning_rate": 7.816763570854644e-06, + "loss": 0.6082, + "step": 2578 + }, + { + "epoch": 0.33, + "grad_norm": 1.4413642883300781, + "learning_rate": 7.815048905806116e-06, + "loss": 0.6031, + "step": 2579 + }, + { + "epoch": 0.33, + "grad_norm": 1.2350773811340332, + "learning_rate": 7.813333755922631e-06, + "loss": 0.6398, + "step": 2580 + }, + { + "epoch": 0.33, + "grad_norm": 2.1799776554107666, + "learning_rate": 7.811618121499591e-06, + "loss": 0.6729, + "step": 2581 + }, + { + "epoch": 0.33, + "grad_norm": 1.3736685514450073, + "learning_rate": 7.809902002832477e-06, + "loss": 0.6516, + "step": 2582 + }, + { + "epoch": 0.33, + "grad_norm": 1.426729679107666, + "learning_rate": 7.808185400216857e-06, + "loss": 0.6552, + "step": 2583 + }, + { + "epoch": 0.33, + "grad_norm": 1.6423767805099487, + "learning_rate": 7.806468313948379e-06, + "loss": 0.6658, + "step": 2584 + }, + { + "epoch": 0.33, + "grad_norm": 1.1850903034210205, + "learning_rate": 7.804750744322777e-06, + "loss": 0.4809, + "step": 2585 + }, + { + "epoch": 0.33, + "grad_norm": 1.1379947662353516, + "learning_rate": 7.80303269163587e-06, + "loss": 0.5726, + "step": 2586 + }, + { + "epoch": 0.33, + "grad_norm": 1.541176438331604, + "learning_rate": 7.801314156183554e-06, + "loss": 0.5963, + "step": 2587 + }, + { + "epoch": 0.33, + "grad_norm": 1.38937509059906, + "learning_rate": 7.799595138261815e-06, + "loss": 0.5938, + "step": 2588 + }, + { + "epoch": 0.33, + "grad_norm": 1.0433942079544067, + "learning_rate": 7.797875638166717e-06, + "loss": 0.7079, + "step": 2589 + }, + { + "epoch": 0.33, + "grad_norm": 1.3225352764129639, + "learning_rate": 7.796155656194409e-06, + "loss": 0.6547, + "step": 2590 + }, + { + "epoch": 0.33, + "grad_norm": 1.3787798881530762, + "learning_rate": 7.794435192641124e-06, + "loss": 0.6904, + "step": 2591 + }, + { + "epoch": 0.33, + "grad_norm": 1.0460058450698853, + "learning_rate": 7.792714247803174e-06, + "loss": 0.5865, + "step": 2592 + }, + { + "epoch": 0.33, + "grad_norm": 1.1611684560775757, + "learning_rate": 7.79099282197696e-06, + "loss": 0.7466, + "step": 2593 + }, + { + "epoch": 0.33, + "grad_norm": 1.1869133710861206, + "learning_rate": 7.789270915458962e-06, + "loss": 0.6646, + "step": 2594 + }, + { + "epoch": 0.33, + "grad_norm": 1.3539026975631714, + "learning_rate": 7.787548528545743e-06, + "loss": 0.5528, + "step": 2595 + }, + { + "epoch": 0.33, + "grad_norm": 1.1651065349578857, + "learning_rate": 7.785825661533947e-06, + "loss": 0.6363, + "step": 2596 + }, + { + "epoch": 0.33, + "grad_norm": 1.2617387771606445, + "learning_rate": 7.784102314720307e-06, + "loss": 0.669, + "step": 2597 + }, + { + "epoch": 0.33, + "grad_norm": 1.5752850770950317, + "learning_rate": 7.782378488401632e-06, + "loss": 0.5657, + "step": 2598 + }, + { + "epoch": 0.33, + "grad_norm": 1.5625871419906616, + "learning_rate": 7.780654182874816e-06, + "loss": 0.6113, + "step": 2599 + }, + { + "epoch": 0.33, + "grad_norm": 1.0509943962097168, + "learning_rate": 7.778929398436835e-06, + "loss": 0.6342, + "step": 2600 + }, + { + "epoch": 0.33, + "grad_norm": 1.2715318202972412, + "learning_rate": 7.777204135384749e-06, + "loss": 0.5642, + "step": 2601 + }, + { + "epoch": 0.33, + "grad_norm": 1.5984022617340088, + "learning_rate": 7.7754783940157e-06, + "loss": 0.6316, + "step": 2602 + }, + { + "epoch": 0.33, + "grad_norm": 1.0083099603652954, + "learning_rate": 7.773752174626911e-06, + "loss": 0.639, + "step": 2603 + }, + { + "epoch": 0.33, + "grad_norm": 1.0958874225616455, + "learning_rate": 7.77202547751569e-06, + "loss": 0.6832, + "step": 2604 + }, + { + "epoch": 0.33, + "grad_norm": 1.1940189599990845, + "learning_rate": 7.770298302979422e-06, + "loss": 0.6686, + "step": 2605 + }, + { + "epoch": 0.33, + "grad_norm": 1.1370244026184082, + "learning_rate": 7.768570651315582e-06, + "loss": 0.5631, + "step": 2606 + }, + { + "epoch": 0.33, + "grad_norm": 1.4601707458496094, + "learning_rate": 7.766842522821718e-06, + "loss": 0.6766, + "step": 2607 + }, + { + "epoch": 0.33, + "grad_norm": 0.9551748633384705, + "learning_rate": 7.76511391779547e-06, + "loss": 0.5999, + "step": 2608 + }, + { + "epoch": 0.33, + "grad_norm": 1.3081096410751343, + "learning_rate": 7.76338483653455e-06, + "loss": 0.5324, + "step": 2609 + }, + { + "epoch": 0.33, + "grad_norm": 1.218382716178894, + "learning_rate": 7.761655279336762e-06, + "loss": 0.6574, + "step": 2610 + }, + { + "epoch": 0.33, + "grad_norm": 1.2414915561676025, + "learning_rate": 7.759925246499984e-06, + "loss": 0.6467, + "step": 2611 + }, + { + "epoch": 0.33, + "grad_norm": 1.1774640083312988, + "learning_rate": 7.75819473832218e-06, + "loss": 0.6357, + "step": 2612 + }, + { + "epoch": 0.33, + "grad_norm": 1.1158899068832397, + "learning_rate": 7.756463755101395e-06, + "loss": 0.5574, + "step": 2613 + }, + { + "epoch": 0.33, + "grad_norm": 1.0917646884918213, + "learning_rate": 7.754732297135756e-06, + "loss": 0.61, + "step": 2614 + }, + { + "epoch": 0.34, + "grad_norm": 1.4236118793487549, + "learning_rate": 7.753000364723471e-06, + "loss": 0.6406, + "step": 2615 + }, + { + "epoch": 0.34, + "grad_norm": 1.2008647918701172, + "learning_rate": 7.75126795816283e-06, + "loss": 0.6095, + "step": 2616 + }, + { + "epoch": 0.34, + "grad_norm": 2.6084930896759033, + "learning_rate": 7.749535077752204e-06, + "loss": 0.5763, + "step": 2617 + }, + { + "epoch": 0.34, + "grad_norm": 1.2119183540344238, + "learning_rate": 7.747801723790046e-06, + "loss": 0.5449, + "step": 2618 + }, + { + "epoch": 0.34, + "grad_norm": 1.2325401306152344, + "learning_rate": 7.746067896574893e-06, + "loss": 0.7414, + "step": 2619 + }, + { + "epoch": 0.34, + "grad_norm": 1.1843703985214233, + "learning_rate": 7.744333596405363e-06, + "loss": 0.6557, + "step": 2620 + }, + { + "epoch": 0.34, + "grad_norm": 1.0715175867080688, + "learning_rate": 7.742598823580149e-06, + "loss": 0.63, + "step": 2621 + }, + { + "epoch": 0.34, + "grad_norm": 2.040466070175171, + "learning_rate": 7.740863578398033e-06, + "loss": 0.5549, + "step": 2622 + }, + { + "epoch": 0.34, + "grad_norm": 1.321418285369873, + "learning_rate": 7.739127861157878e-06, + "loss": 0.6138, + "step": 2623 + }, + { + "epoch": 0.34, + "grad_norm": 1.3999861478805542, + "learning_rate": 7.737391672158621e-06, + "loss": 0.5579, + "step": 2624 + }, + { + "epoch": 0.34, + "grad_norm": 1.2321044206619263, + "learning_rate": 7.73565501169929e-06, + "loss": 0.6219, + "step": 2625 + }, + { + "epoch": 0.34, + "grad_norm": 1.1841264963150024, + "learning_rate": 7.733917880078988e-06, + "loss": 0.5434, + "step": 2626 + }, + { + "epoch": 0.34, + "grad_norm": 1.1282161474227905, + "learning_rate": 7.732180277596899e-06, + "loss": 0.5728, + "step": 2627 + }, + { + "epoch": 0.34, + "grad_norm": 1.2175527811050415, + "learning_rate": 7.730442204552292e-06, + "loss": 0.6357, + "step": 2628 + }, + { + "epoch": 0.34, + "grad_norm": 1.1946020126342773, + "learning_rate": 7.72870366124451e-06, + "loss": 0.6907, + "step": 2629 + }, + { + "epoch": 0.34, + "grad_norm": 1.3675916194915771, + "learning_rate": 7.726964647972987e-06, + "loss": 0.7276, + "step": 2630 + }, + { + "epoch": 0.34, + "grad_norm": 1.191794753074646, + "learning_rate": 7.725225165037233e-06, + "loss": 0.6401, + "step": 2631 + }, + { + "epoch": 0.34, + "grad_norm": 1.3735675811767578, + "learning_rate": 7.723485212736835e-06, + "loss": 0.7791, + "step": 2632 + }, + { + "epoch": 0.34, + "grad_norm": 1.5447452068328857, + "learning_rate": 7.721744791371466e-06, + "loss": 0.7008, + "step": 2633 + }, + { + "epoch": 0.34, + "grad_norm": 2.7708098888397217, + "learning_rate": 7.72000390124088e-06, + "loss": 0.7301, + "step": 2634 + }, + { + "epoch": 0.34, + "grad_norm": 1.259595513343811, + "learning_rate": 7.718262542644906e-06, + "loss": 0.6117, + "step": 2635 + }, + { + "epoch": 0.34, + "grad_norm": 1.14170503616333, + "learning_rate": 7.716520715883463e-06, + "loss": 0.6366, + "step": 2636 + }, + { + "epoch": 0.34, + "grad_norm": 1.3256770372390747, + "learning_rate": 7.714778421256538e-06, + "loss": 0.6714, + "step": 2637 + }, + { + "epoch": 0.34, + "grad_norm": 1.6231776475906372, + "learning_rate": 7.71303565906421e-06, + "loss": 0.5758, + "step": 2638 + }, + { + "epoch": 0.34, + "grad_norm": 1.823754906654358, + "learning_rate": 7.711292429606635e-06, + "loss": 0.6082, + "step": 2639 + }, + { + "epoch": 0.34, + "grad_norm": 1.233652114868164, + "learning_rate": 7.70954873318405e-06, + "loss": 0.613, + "step": 2640 + }, + { + "epoch": 0.34, + "grad_norm": 1.0892643928527832, + "learning_rate": 7.707804570096769e-06, + "loss": 0.5392, + "step": 2641 + }, + { + "epoch": 0.34, + "grad_norm": 1.5985088348388672, + "learning_rate": 7.706059940645187e-06, + "loss": 0.6543, + "step": 2642 + }, + { + "epoch": 0.34, + "grad_norm": 1.1690950393676758, + "learning_rate": 7.704314845129785e-06, + "loss": 0.6247, + "step": 2643 + }, + { + "epoch": 0.34, + "grad_norm": 1.276746392250061, + "learning_rate": 7.702569283851117e-06, + "loss": 0.6876, + "step": 2644 + }, + { + "epoch": 0.34, + "grad_norm": 1.1838765144348145, + "learning_rate": 7.700823257109821e-06, + "loss": 0.5706, + "step": 2645 + }, + { + "epoch": 0.34, + "grad_norm": 1.1753582954406738, + "learning_rate": 7.699076765206617e-06, + "loss": 0.7981, + "step": 2646 + }, + { + "epoch": 0.34, + "grad_norm": 1.190260410308838, + "learning_rate": 7.6973298084423e-06, + "loss": 0.6326, + "step": 2647 + }, + { + "epoch": 0.34, + "grad_norm": 1.8324419260025024, + "learning_rate": 7.695582387117749e-06, + "loss": 0.678, + "step": 2648 + }, + { + "epoch": 0.34, + "grad_norm": 1.1149442195892334, + "learning_rate": 7.693834501533921e-06, + "loss": 0.6484, + "step": 2649 + }, + { + "epoch": 0.34, + "grad_norm": 1.1948319673538208, + "learning_rate": 7.692086151991855e-06, + "loss": 0.5729, + "step": 2650 + }, + { + "epoch": 0.34, + "grad_norm": 1.7144792079925537, + "learning_rate": 7.690337338792667e-06, + "loss": 0.683, + "step": 2651 + }, + { + "epoch": 0.34, + "grad_norm": 1.9441317319869995, + "learning_rate": 7.688588062237557e-06, + "loss": 0.6081, + "step": 2652 + }, + { + "epoch": 0.34, + "grad_norm": 1.2331782579421997, + "learning_rate": 7.6868383226278e-06, + "loss": 0.6088, + "step": 2653 + }, + { + "epoch": 0.34, + "grad_norm": 1.6664066314697266, + "learning_rate": 7.685088120264754e-06, + "loss": 0.6171, + "step": 2654 + }, + { + "epoch": 0.34, + "grad_norm": 1.3156743049621582, + "learning_rate": 7.683337455449856e-06, + "loss": 0.6272, + "step": 2655 + }, + { + "epoch": 0.34, + "grad_norm": 1.1555067300796509, + "learning_rate": 7.681586328484621e-06, + "loss": 0.5953, + "step": 2656 + }, + { + "epoch": 0.34, + "grad_norm": 1.3859845399856567, + "learning_rate": 7.679834739670649e-06, + "loss": 0.634, + "step": 2657 + }, + { + "epoch": 0.34, + "grad_norm": 2.6940677165985107, + "learning_rate": 7.67808268930961e-06, + "loss": 0.6804, + "step": 2658 + }, + { + "epoch": 0.34, + "grad_norm": 1.2090493440628052, + "learning_rate": 7.676330177703262e-06, + "loss": 0.6196, + "step": 2659 + }, + { + "epoch": 0.34, + "grad_norm": 1.1433385610580444, + "learning_rate": 7.674577205153441e-06, + "loss": 0.6498, + "step": 2660 + }, + { + "epoch": 0.34, + "grad_norm": 1.3377647399902344, + "learning_rate": 7.672823771962059e-06, + "loss": 0.5986, + "step": 2661 + }, + { + "epoch": 0.34, + "grad_norm": 1.0680303573608398, + "learning_rate": 7.671069878431107e-06, + "loss": 0.5319, + "step": 2662 + }, + { + "epoch": 0.34, + "grad_norm": 1.3417402505874634, + "learning_rate": 7.669315524862662e-06, + "loss": 0.6823, + "step": 2663 + }, + { + "epoch": 0.34, + "grad_norm": 1.365355372428894, + "learning_rate": 7.667560711558875e-06, + "loss": 0.6342, + "step": 2664 + }, + { + "epoch": 0.34, + "grad_norm": 1.3211275339126587, + "learning_rate": 7.665805438821973e-06, + "loss": 0.6097, + "step": 2665 + }, + { + "epoch": 0.34, + "grad_norm": 1.580230474472046, + "learning_rate": 7.664049706954271e-06, + "loss": 0.6559, + "step": 2666 + }, + { + "epoch": 0.34, + "grad_norm": 1.0732578039169312, + "learning_rate": 7.662293516258154e-06, + "loss": 0.5693, + "step": 2667 + }, + { + "epoch": 0.34, + "grad_norm": 1.8319809436798096, + "learning_rate": 7.660536867036092e-06, + "loss": 0.6538, + "step": 2668 + }, + { + "epoch": 0.34, + "grad_norm": 1.520002007484436, + "learning_rate": 7.658779759590634e-06, + "loss": 0.5934, + "step": 2669 + }, + { + "epoch": 0.34, + "grad_norm": 1.2555063962936401, + "learning_rate": 7.657022194224402e-06, + "loss": 0.6547, + "step": 2670 + }, + { + "epoch": 0.34, + "grad_norm": 2.653493642807007, + "learning_rate": 7.655264171240105e-06, + "loss": 0.6805, + "step": 2671 + }, + { + "epoch": 0.34, + "grad_norm": 1.030552625656128, + "learning_rate": 7.653505690940522e-06, + "loss": 0.5611, + "step": 2672 + }, + { + "epoch": 0.34, + "grad_norm": 1.2763005495071411, + "learning_rate": 7.65174675362852e-06, + "loss": 0.6387, + "step": 2673 + }, + { + "epoch": 0.34, + "grad_norm": 1.7102898359298706, + "learning_rate": 7.649987359607039e-06, + "loss": 0.6756, + "step": 2674 + }, + { + "epoch": 0.34, + "grad_norm": 1.2029314041137695, + "learning_rate": 7.648227509179095e-06, + "loss": 0.7813, + "step": 2675 + }, + { + "epoch": 0.34, + "grad_norm": 1.1546415090560913, + "learning_rate": 7.646467202647794e-06, + "loss": 0.6681, + "step": 2676 + }, + { + "epoch": 0.34, + "grad_norm": 1.1073461771011353, + "learning_rate": 7.644706440316308e-06, + "loss": 0.6578, + "step": 2677 + }, + { + "epoch": 0.34, + "grad_norm": 1.2768898010253906, + "learning_rate": 7.642945222487892e-06, + "loss": 0.7158, + "step": 2678 + }, + { + "epoch": 0.34, + "grad_norm": 1.2239513397216797, + "learning_rate": 7.641183549465881e-06, + "loss": 0.6005, + "step": 2679 + }, + { + "epoch": 0.34, + "grad_norm": 1.0863401889801025, + "learning_rate": 7.639421421553687e-06, + "loss": 0.6159, + "step": 2680 + }, + { + "epoch": 0.34, + "grad_norm": 1.596559762954712, + "learning_rate": 7.637658839054805e-06, + "loss": 0.6288, + "step": 2681 + }, + { + "epoch": 0.34, + "grad_norm": 1.3041114807128906, + "learning_rate": 7.635895802272796e-06, + "loss": 0.5878, + "step": 2682 + }, + { + "epoch": 0.34, + "grad_norm": 1.0410126447677612, + "learning_rate": 7.634132311511316e-06, + "loss": 0.5978, + "step": 2683 + }, + { + "epoch": 0.34, + "grad_norm": 0.9961788654327393, + "learning_rate": 7.632368367074083e-06, + "loss": 0.5053, + "step": 2684 + }, + { + "epoch": 0.34, + "grad_norm": 1.2974720001220703, + "learning_rate": 7.630603969264905e-06, + "loss": 0.6118, + "step": 2685 + }, + { + "epoch": 0.34, + "grad_norm": 1.1592649221420288, + "learning_rate": 7.628839118387662e-06, + "loss": 0.6189, + "step": 2686 + }, + { + "epoch": 0.34, + "grad_norm": 1.093314528465271, + "learning_rate": 7.627073814746315e-06, + "loss": 0.6362, + "step": 2687 + }, + { + "epoch": 0.34, + "grad_norm": 1.052543044090271, + "learning_rate": 7.625308058644898e-06, + "loss": 0.6143, + "step": 2688 + }, + { + "epoch": 0.34, + "grad_norm": 1.3054975271224976, + "learning_rate": 7.623541850387531e-06, + "loss": 0.6378, + "step": 2689 + }, + { + "epoch": 0.34, + "grad_norm": 1.0570430755615234, + "learning_rate": 7.621775190278407e-06, + "loss": 0.5995, + "step": 2690 + }, + { + "epoch": 0.34, + "grad_norm": 1.4440031051635742, + "learning_rate": 7.620008078621793e-06, + "loss": 0.6158, + "step": 2691 + }, + { + "epoch": 0.34, + "grad_norm": 1.3761212825775146, + "learning_rate": 7.618240515722044e-06, + "loss": 0.5676, + "step": 2692 + }, + { + "epoch": 0.35, + "grad_norm": 1.1331719160079956, + "learning_rate": 7.616472501883583e-06, + "loss": 0.6331, + "step": 2693 + }, + { + "epoch": 0.35, + "grad_norm": 1.8078550100326538, + "learning_rate": 7.614704037410915e-06, + "loss": 0.5285, + "step": 2694 + }, + { + "epoch": 0.35, + "grad_norm": 1.2421823740005493, + "learning_rate": 7.6129351226086225e-06, + "loss": 0.7476, + "step": 2695 + }, + { + "epoch": 0.35, + "grad_norm": 1.6171449422836304, + "learning_rate": 7.6111657577813644e-06, + "loss": 0.6853, + "step": 2696 + }, + { + "epoch": 0.35, + "grad_norm": 2.045614719390869, + "learning_rate": 7.609395943233877e-06, + "loss": 0.6106, + "step": 2697 + }, + { + "epoch": 0.35, + "grad_norm": 1.2099361419677734, + "learning_rate": 7.6076256792709805e-06, + "loss": 0.5939, + "step": 2698 + }, + { + "epoch": 0.35, + "grad_norm": 1.3107006549835205, + "learning_rate": 7.605854966197559e-06, + "loss": 0.6542, + "step": 2699 + }, + { + "epoch": 0.35, + "grad_norm": 1.4774034023284912, + "learning_rate": 7.604083804318587e-06, + "loss": 0.6179, + "step": 2700 + }, + { + "epoch": 0.35, + "grad_norm": 1.5129690170288086, + "learning_rate": 7.60231219393911e-06, + "loss": 0.6467, + "step": 2701 + }, + { + "epoch": 0.35, + "grad_norm": 1.1982402801513672, + "learning_rate": 7.600540135364252e-06, + "loss": 0.6278, + "step": 2702 + }, + { + "epoch": 0.35, + "grad_norm": 1.2928173542022705, + "learning_rate": 7.598767628899213e-06, + "loss": 0.7124, + "step": 2703 + }, + { + "epoch": 0.35, + "grad_norm": 1.1297682523727417, + "learning_rate": 7.596994674849272e-06, + "loss": 0.5936, + "step": 2704 + }, + { + "epoch": 0.35, + "grad_norm": 1.0973412990570068, + "learning_rate": 7.595221273519784e-06, + "loss": 0.7179, + "step": 2705 + }, + { + "epoch": 0.35, + "grad_norm": 1.5251829624176025, + "learning_rate": 7.59344742521618e-06, + "loss": 0.6319, + "step": 2706 + }, + { + "epoch": 0.35, + "grad_norm": 1.0643885135650635, + "learning_rate": 7.591673130243973e-06, + "loss": 0.6339, + "step": 2707 + }, + { + "epoch": 0.35, + "grad_norm": 1.0274592638015747, + "learning_rate": 7.589898388908745e-06, + "loss": 0.6236, + "step": 2708 + }, + { + "epoch": 0.35, + "grad_norm": 1.272946834564209, + "learning_rate": 7.588123201516164e-06, + "loss": 0.6329, + "step": 2709 + }, + { + "epoch": 0.35, + "grad_norm": 1.2281945943832397, + "learning_rate": 7.5863475683719655e-06, + "loss": 0.5959, + "step": 2710 + }, + { + "epoch": 0.35, + "grad_norm": 1.1847566366195679, + "learning_rate": 7.584571489781968e-06, + "loss": 0.5932, + "step": 2711 + }, + { + "epoch": 0.35, + "grad_norm": 1.0989586114883423, + "learning_rate": 7.582794966052064e-06, + "loss": 0.6339, + "step": 2712 + }, + { + "epoch": 0.35, + "grad_norm": 1.114815354347229, + "learning_rate": 7.581017997488225e-06, + "loss": 0.5454, + "step": 2713 + }, + { + "epoch": 0.35, + "grad_norm": 2.860734224319458, + "learning_rate": 7.579240584396497e-06, + "loss": 0.5895, + "step": 2714 + }, + { + "epoch": 0.35, + "grad_norm": 1.2328461408615112, + "learning_rate": 7.577462727083002e-06, + "loss": 0.6297, + "step": 2715 + }, + { + "epoch": 0.35, + "grad_norm": 1.1195461750030518, + "learning_rate": 7.575684425853944e-06, + "loss": 0.5487, + "step": 2716 + }, + { + "epoch": 0.35, + "grad_norm": 1.1979864835739136, + "learning_rate": 7.573905681015594e-06, + "loss": 0.6874, + "step": 2717 + }, + { + "epoch": 0.35, + "grad_norm": 1.1667231321334839, + "learning_rate": 7.57212649287431e-06, + "loss": 0.5755, + "step": 2718 + }, + { + "epoch": 0.35, + "grad_norm": 2.0767219066619873, + "learning_rate": 7.570346861736515e-06, + "loss": 0.6358, + "step": 2719 + }, + { + "epoch": 0.35, + "grad_norm": 1.3232417106628418, + "learning_rate": 7.568566787908719e-06, + "loss": 0.5362, + "step": 2720 + }, + { + "epoch": 0.35, + "grad_norm": 1.3619623184204102, + "learning_rate": 7.5667862716975e-06, + "loss": 0.6142, + "step": 2721 + }, + { + "epoch": 0.35, + "grad_norm": 1.1791231632232666, + "learning_rate": 7.565005313409519e-06, + "loss": 0.5849, + "step": 2722 + }, + { + "epoch": 0.35, + "grad_norm": 1.304181694984436, + "learning_rate": 7.563223913351507e-06, + "loss": 0.6559, + "step": 2723 + }, + { + "epoch": 0.35, + "grad_norm": 1.1328728199005127, + "learning_rate": 7.561442071830279e-06, + "loss": 0.5632, + "step": 2724 + }, + { + "epoch": 0.35, + "grad_norm": 1.1923437118530273, + "learning_rate": 7.559659789152714e-06, + "loss": 0.5817, + "step": 2725 + }, + { + "epoch": 0.35, + "grad_norm": 1.4104726314544678, + "learning_rate": 7.557877065625778e-06, + "loss": 0.7269, + "step": 2726 + }, + { + "epoch": 0.35, + "grad_norm": 1.1886721849441528, + "learning_rate": 7.5560939015565084e-06, + "loss": 0.524, + "step": 2727 + }, + { + "epoch": 0.35, + "grad_norm": 1.6470292806625366, + "learning_rate": 7.554310297252019e-06, + "loss": 0.6261, + "step": 2728 + }, + { + "epoch": 0.35, + "grad_norm": 1.882947564125061, + "learning_rate": 7.552526253019499e-06, + "loss": 0.6656, + "step": 2729 + }, + { + "epoch": 0.35, + "grad_norm": 1.4039462804794312, + "learning_rate": 7.550741769166215e-06, + "loss": 0.6302, + "step": 2730 + }, + { + "epoch": 0.35, + "grad_norm": 2.5081090927124023, + "learning_rate": 7.548956845999504e-06, + "loss": 0.652, + "step": 2731 + }, + { + "epoch": 0.35, + "grad_norm": 1.2422338724136353, + "learning_rate": 7.547171483826788e-06, + "loss": 0.6617, + "step": 2732 + }, + { + "epoch": 0.35, + "grad_norm": 1.51920485496521, + "learning_rate": 7.545385682955558e-06, + "loss": 0.5829, + "step": 2733 + }, + { + "epoch": 0.35, + "grad_norm": 1.3961273431777954, + "learning_rate": 7.543599443693379e-06, + "loss": 0.6472, + "step": 2734 + }, + { + "epoch": 0.35, + "grad_norm": 1.0059709548950195, + "learning_rate": 7.541812766347898e-06, + "loss": 0.6372, + "step": 2735 + }, + { + "epoch": 0.35, + "grad_norm": 1.0821688175201416, + "learning_rate": 7.540025651226832e-06, + "loss": 0.618, + "step": 2736 + }, + { + "epoch": 0.35, + "grad_norm": 1.056602954864502, + "learning_rate": 7.538238098637976e-06, + "loss": 0.5796, + "step": 2737 + }, + { + "epoch": 0.35, + "grad_norm": 1.2009097337722778, + "learning_rate": 7.536450108889197e-06, + "loss": 0.5918, + "step": 2738 + }, + { + "epoch": 0.35, + "grad_norm": 1.1832275390625, + "learning_rate": 7.534661682288444e-06, + "loss": 0.6911, + "step": 2739 + }, + { + "epoch": 0.35, + "grad_norm": 1.2712024450302124, + "learning_rate": 7.532872819143735e-06, + "loss": 0.57, + "step": 2740 + }, + { + "epoch": 0.35, + "grad_norm": 1.5780545473098755, + "learning_rate": 7.531083519763164e-06, + "loss": 0.5392, + "step": 2741 + }, + { + "epoch": 0.35, + "grad_norm": 1.6948215961456299, + "learning_rate": 7.5292937844549045e-06, + "loss": 0.6263, + "step": 2742 + }, + { + "epoch": 0.35, + "grad_norm": 1.145394206047058, + "learning_rate": 7.527503613527198e-06, + "loss": 0.5929, + "step": 2743 + }, + { + "epoch": 0.35, + "grad_norm": 1.0543674230575562, + "learning_rate": 7.525713007288371e-06, + "loss": 0.5308, + "step": 2744 + }, + { + "epoch": 0.35, + "grad_norm": 1.533777117729187, + "learning_rate": 7.523921966046813e-06, + "loss": 0.6327, + "step": 2745 + }, + { + "epoch": 0.35, + "grad_norm": 1.6052230596542358, + "learning_rate": 7.522130490110999e-06, + "loss": 0.6528, + "step": 2746 + }, + { + "epoch": 0.35, + "grad_norm": 1.4751596450805664, + "learning_rate": 7.52033857978947e-06, + "loss": 0.6004, + "step": 2747 + }, + { + "epoch": 0.35, + "grad_norm": 1.4990054368972778, + "learning_rate": 7.51854623539085e-06, + "loss": 0.5579, + "step": 2748 + }, + { + "epoch": 0.35, + "grad_norm": 1.8240008354187012, + "learning_rate": 7.516753457223831e-06, + "loss": 0.6331, + "step": 2749 + }, + { + "epoch": 0.35, + "grad_norm": 1.3702977895736694, + "learning_rate": 7.514960245597185e-06, + "loss": 0.6204, + "step": 2750 + }, + { + "epoch": 0.35, + "grad_norm": 1.5224536657333374, + "learning_rate": 7.5131666008197546e-06, + "loss": 0.6065, + "step": 2751 + }, + { + "epoch": 0.35, + "grad_norm": 1.307070016860962, + "learning_rate": 7.51137252320046e-06, + "loss": 0.7614, + "step": 2752 + }, + { + "epoch": 0.35, + "grad_norm": 1.2971265316009521, + "learning_rate": 7.509578013048293e-06, + "loss": 0.5903, + "step": 2753 + }, + { + "epoch": 0.35, + "grad_norm": 1.0652785301208496, + "learning_rate": 7.507783070672323e-06, + "loss": 0.6088, + "step": 2754 + }, + { + "epoch": 0.35, + "grad_norm": 1.1220684051513672, + "learning_rate": 7.505987696381692e-06, + "loss": 0.6447, + "step": 2755 + }, + { + "epoch": 0.35, + "grad_norm": 1.2321696281433105, + "learning_rate": 7.504191890485615e-06, + "loss": 0.6243, + "step": 2756 + }, + { + "epoch": 0.35, + "grad_norm": 1.2906885147094727, + "learning_rate": 7.502395653293385e-06, + "loss": 0.6823, + "step": 2757 + }, + { + "epoch": 0.35, + "grad_norm": 1.1404194831848145, + "learning_rate": 7.500598985114367e-06, + "loss": 0.54, + "step": 2758 + }, + { + "epoch": 0.35, + "grad_norm": 1.316081166267395, + "learning_rate": 7.498801886258001e-06, + "loss": 0.597, + "step": 2759 + }, + { + "epoch": 0.35, + "grad_norm": 1.2815004587173462, + "learning_rate": 7.497004357033799e-06, + "loss": 0.6086, + "step": 2760 + }, + { + "epoch": 0.35, + "grad_norm": 1.0035674571990967, + "learning_rate": 7.495206397751351e-06, + "loss": 0.6706, + "step": 2761 + }, + { + "epoch": 0.35, + "grad_norm": 2.31266188621521, + "learning_rate": 7.493408008720317e-06, + "loss": 0.5874, + "step": 2762 + }, + { + "epoch": 0.35, + "grad_norm": 1.275524377822876, + "learning_rate": 7.491609190250435e-06, + "loss": 0.5634, + "step": 2763 + }, + { + "epoch": 0.35, + "grad_norm": 1.0364302396774292, + "learning_rate": 7.489809942651514e-06, + "loss": 0.5669, + "step": 2764 + }, + { + "epoch": 0.35, + "grad_norm": 1.401902198791504, + "learning_rate": 7.488010266233436e-06, + "loss": 0.6698, + "step": 2765 + }, + { + "epoch": 0.35, + "grad_norm": 1.1657154560089111, + "learning_rate": 7.486210161306161e-06, + "loss": 0.585, + "step": 2766 + }, + { + "epoch": 0.35, + "grad_norm": 1.433221697807312, + "learning_rate": 7.48440962817972e-06, + "loss": 0.5722, + "step": 2767 + }, + { + "epoch": 0.35, + "grad_norm": 1.1307737827301025, + "learning_rate": 7.482608667164218e-06, + "loss": 0.6006, + "step": 2768 + }, + { + "epoch": 0.35, + "grad_norm": 1.26017165184021, + "learning_rate": 7.480807278569835e-06, + "loss": 0.7052, + "step": 2769 + }, + { + "epoch": 0.35, + "grad_norm": 1.2488316297531128, + "learning_rate": 7.479005462706821e-06, + "loss": 0.5859, + "step": 2770 + }, + { + "epoch": 0.36, + "grad_norm": 1.1443097591400146, + "learning_rate": 7.477203219885505e-06, + "loss": 0.632, + "step": 2771 + }, + { + "epoch": 0.36, + "grad_norm": 1.0260684490203857, + "learning_rate": 7.4754005504162855e-06, + "loss": 0.6807, + "step": 2772 + }, + { + "epoch": 0.36, + "grad_norm": 1.0291800498962402, + "learning_rate": 7.4735974546096335e-06, + "loss": 0.6448, + "step": 2773 + }, + { + "epoch": 0.36, + "grad_norm": 1.8409909009933472, + "learning_rate": 7.4717939327761005e-06, + "loss": 0.6562, + "step": 2774 + }, + { + "epoch": 0.36, + "grad_norm": 1.0962021350860596, + "learning_rate": 7.4699899852263e-06, + "loss": 0.6475, + "step": 2775 + }, + { + "epoch": 0.36, + "grad_norm": 1.6759026050567627, + "learning_rate": 7.468185612270932e-06, + "loss": 0.6217, + "step": 2776 + }, + { + "epoch": 0.36, + "grad_norm": 1.2551426887512207, + "learning_rate": 7.466380814220758e-06, + "loss": 0.6965, + "step": 2777 + }, + { + "epoch": 0.36, + "grad_norm": 1.2845110893249512, + "learning_rate": 7.464575591386623e-06, + "loss": 0.6527, + "step": 2778 + }, + { + "epoch": 0.36, + "grad_norm": 1.096612572669983, + "learning_rate": 7.462769944079433e-06, + "loss": 0.6857, + "step": 2779 + }, + { + "epoch": 0.36, + "grad_norm": 1.328682780265808, + "learning_rate": 7.460963872610181e-06, + "loss": 0.6129, + "step": 2780 + }, + { + "epoch": 0.36, + "grad_norm": 1.3041599988937378, + "learning_rate": 7.45915737728992e-06, + "loss": 0.6, + "step": 2781 + }, + { + "epoch": 0.36, + "grad_norm": 1.4073518514633179, + "learning_rate": 7.457350458429788e-06, + "loss": 0.697, + "step": 2782 + }, + { + "epoch": 0.36, + "grad_norm": 1.2230504751205444, + "learning_rate": 7.455543116340985e-06, + "loss": 0.665, + "step": 2783 + }, + { + "epoch": 0.36, + "grad_norm": 1.3002861738204956, + "learning_rate": 7.453735351334791e-06, + "loss": 0.6305, + "step": 2784 + }, + { + "epoch": 0.36, + "grad_norm": 1.2461925745010376, + "learning_rate": 7.451927163722557e-06, + "loss": 0.6243, + "step": 2785 + }, + { + "epoch": 0.36, + "grad_norm": 1.6128621101379395, + "learning_rate": 7.450118553815707e-06, + "loss": 0.6508, + "step": 2786 + }, + { + "epoch": 0.36, + "grad_norm": 1.3560428619384766, + "learning_rate": 7.448309521925737e-06, + "loss": 0.6631, + "step": 2787 + }, + { + "epoch": 0.36, + "grad_norm": 1.214652180671692, + "learning_rate": 7.4465000683642144e-06, + "loss": 0.6243, + "step": 2788 + }, + { + "epoch": 0.36, + "grad_norm": 2.2300350666046143, + "learning_rate": 7.444690193442783e-06, + "loss": 0.6499, + "step": 2789 + }, + { + "epoch": 0.36, + "grad_norm": 1.3689312934875488, + "learning_rate": 7.442879897473156e-06, + "loss": 0.6187, + "step": 2790 + }, + { + "epoch": 0.36, + "grad_norm": 1.1015273332595825, + "learning_rate": 7.441069180767119e-06, + "loss": 0.5566, + "step": 2791 + }, + { + "epoch": 0.36, + "grad_norm": 1.6342823505401611, + "learning_rate": 7.439258043636532e-06, + "loss": 0.5791, + "step": 2792 + }, + { + "epoch": 0.36, + "grad_norm": 1.212996482849121, + "learning_rate": 7.437446486393327e-06, + "loss": 0.535, + "step": 2793 + }, + { + "epoch": 0.36, + "grad_norm": 1.1164084672927856, + "learning_rate": 7.435634509349509e-06, + "loss": 0.7325, + "step": 2794 + }, + { + "epoch": 0.36, + "grad_norm": 1.421766996383667, + "learning_rate": 7.433822112817151e-06, + "loss": 0.5267, + "step": 2795 + }, + { + "epoch": 0.36, + "grad_norm": 1.4409089088439941, + "learning_rate": 7.4320092971084044e-06, + "loss": 0.5939, + "step": 2796 + }, + { + "epoch": 0.36, + "grad_norm": 1.0344882011413574, + "learning_rate": 7.430196062535488e-06, + "loss": 0.5837, + "step": 2797 + }, + { + "epoch": 0.36, + "grad_norm": 1.0946214199066162, + "learning_rate": 7.428382409410697e-06, + "loss": 0.5718, + "step": 2798 + }, + { + "epoch": 0.36, + "grad_norm": 1.3646178245544434, + "learning_rate": 7.426568338046394e-06, + "loss": 0.5607, + "step": 2799 + }, + { + "epoch": 0.36, + "grad_norm": 1.1723817586898804, + "learning_rate": 7.4247538487550154e-06, + "loss": 0.6713, + "step": 2800 + }, + { + "epoch": 0.36, + "grad_norm": 1.326438546180725, + "learning_rate": 7.422938941849073e-06, + "loss": 0.6182, + "step": 2801 + }, + { + "epoch": 0.36, + "grad_norm": 1.0464811325073242, + "learning_rate": 7.421123617641145e-06, + "loss": 0.5515, + "step": 2802 + }, + { + "epoch": 0.36, + "grad_norm": 1.1844979524612427, + "learning_rate": 7.4193078764438855e-06, + "loss": 0.5557, + "step": 2803 + }, + { + "epoch": 0.36, + "grad_norm": 1.2062512636184692, + "learning_rate": 7.417491718570018e-06, + "loss": 0.6877, + "step": 2804 + }, + { + "epoch": 0.36, + "grad_norm": 1.0828931331634521, + "learning_rate": 7.41567514433234e-06, + "loss": 0.6256, + "step": 2805 + }, + { + "epoch": 0.36, + "grad_norm": 1.0971540212631226, + "learning_rate": 7.413858154043718e-06, + "loss": 0.5836, + "step": 2806 + }, + { + "epoch": 0.36, + "grad_norm": 1.0136876106262207, + "learning_rate": 7.412040748017094e-06, + "loss": 0.6453, + "step": 2807 + }, + { + "epoch": 0.36, + "grad_norm": 1.2080310583114624, + "learning_rate": 7.410222926565477e-06, + "loss": 0.695, + "step": 2808 + }, + { + "epoch": 0.36, + "grad_norm": 1.1714882850646973, + "learning_rate": 7.408404690001949e-06, + "loss": 0.6441, + "step": 2809 + }, + { + "epoch": 0.36, + "grad_norm": 1.214609980583191, + "learning_rate": 7.406586038639667e-06, + "loss": 0.6754, + "step": 2810 + }, + { + "epoch": 0.36, + "grad_norm": 1.2446070909500122, + "learning_rate": 7.404766972791856e-06, + "loss": 0.5658, + "step": 2811 + }, + { + "epoch": 0.36, + "grad_norm": 1.5085375308990479, + "learning_rate": 7.402947492771811e-06, + "loss": 0.6991, + "step": 2812 + }, + { + "epoch": 0.36, + "grad_norm": 2.154221534729004, + "learning_rate": 7.4011275988929046e-06, + "loss": 0.6379, + "step": 2813 + }, + { + "epoch": 0.36, + "grad_norm": 1.2423889636993408, + "learning_rate": 7.399307291468572e-06, + "loss": 0.6439, + "step": 2814 + }, + { + "epoch": 0.36, + "grad_norm": 1.5794448852539062, + "learning_rate": 7.3974865708123276e-06, + "loss": 0.6704, + "step": 2815 + }, + { + "epoch": 0.36, + "grad_norm": 1.1966044902801514, + "learning_rate": 7.395665437237751e-06, + "loss": 0.6989, + "step": 2816 + }, + { + "epoch": 0.36, + "grad_norm": 1.102135419845581, + "learning_rate": 7.3938438910584964e-06, + "loss": 0.6813, + "step": 2817 + }, + { + "epoch": 0.36, + "grad_norm": 1.2617212533950806, + "learning_rate": 7.392021932588289e-06, + "loss": 0.6861, + "step": 2818 + }, + { + "epoch": 0.36, + "grad_norm": 1.1992825269699097, + "learning_rate": 7.3901995621409224e-06, + "loss": 0.5791, + "step": 2819 + }, + { + "epoch": 0.36, + "grad_norm": 1.2392593622207642, + "learning_rate": 7.3883767800302665e-06, + "loss": 0.7588, + "step": 2820 + }, + { + "epoch": 0.36, + "grad_norm": 1.4153352975845337, + "learning_rate": 7.386553586570253e-06, + "loss": 0.62, + "step": 2821 + }, + { + "epoch": 0.36, + "grad_norm": 1.2339427471160889, + "learning_rate": 7.384729982074894e-06, + "loss": 0.6189, + "step": 2822 + }, + { + "epoch": 0.36, + "grad_norm": 1.1156781911849976, + "learning_rate": 7.382905966858267e-06, + "loss": 0.57, + "step": 2823 + }, + { + "epoch": 0.36, + "grad_norm": 1.5094579458236694, + "learning_rate": 7.381081541234522e-06, + "loss": 0.6152, + "step": 2824 + }, + { + "epoch": 0.36, + "grad_norm": 1.3721705675125122, + "learning_rate": 7.3792567055178766e-06, + "loss": 0.5664, + "step": 2825 + }, + { + "epoch": 0.36, + "grad_norm": 1.3138307332992554, + "learning_rate": 7.377431460022625e-06, + "loss": 0.6344, + "step": 2826 + }, + { + "epoch": 0.36, + "grad_norm": 1.6809388399124146, + "learning_rate": 7.375605805063128e-06, + "loss": 0.5832, + "step": 2827 + }, + { + "epoch": 0.36, + "grad_norm": 1.2338505983352661, + "learning_rate": 7.373779740953817e-06, + "loss": 0.5376, + "step": 2828 + }, + { + "epoch": 0.36, + "grad_norm": 1.1951864957809448, + "learning_rate": 7.371953268009194e-06, + "loss": 0.5897, + "step": 2829 + }, + { + "epoch": 0.36, + "grad_norm": 1.2772202491760254, + "learning_rate": 7.370126386543833e-06, + "loss": 0.6407, + "step": 2830 + }, + { + "epoch": 0.36, + "grad_norm": 1.6042275428771973, + "learning_rate": 7.368299096872377e-06, + "loss": 0.6627, + "step": 2831 + }, + { + "epoch": 0.36, + "grad_norm": 1.3527517318725586, + "learning_rate": 7.366471399309538e-06, + "loss": 0.589, + "step": 2832 + }, + { + "epoch": 0.36, + "grad_norm": 1.1194247007369995, + "learning_rate": 7.3646432941701015e-06, + "loss": 0.6234, + "step": 2833 + }, + { + "epoch": 0.36, + "grad_norm": 1.0248486995697021, + "learning_rate": 7.362814781768919e-06, + "loss": 0.551, + "step": 2834 + }, + { + "epoch": 0.36, + "grad_norm": 1.1566435098648071, + "learning_rate": 7.360985862420919e-06, + "loss": 0.6106, + "step": 2835 + }, + { + "epoch": 0.36, + "grad_norm": 1.258584976196289, + "learning_rate": 7.359156536441091e-06, + "loss": 0.6558, + "step": 2836 + }, + { + "epoch": 0.36, + "grad_norm": 1.1506608724594116, + "learning_rate": 7.357326804144502e-06, + "loss": 0.6423, + "step": 2837 + }, + { + "epoch": 0.36, + "grad_norm": 1.1835181713104248, + "learning_rate": 7.355496665846285e-06, + "loss": 0.672, + "step": 2838 + }, + { + "epoch": 0.36, + "grad_norm": 1.3529503345489502, + "learning_rate": 7.353666121861646e-06, + "loss": 0.5715, + "step": 2839 + }, + { + "epoch": 0.36, + "grad_norm": 1.514220952987671, + "learning_rate": 7.3518351725058555e-06, + "loss": 0.637, + "step": 2840 + }, + { + "epoch": 0.36, + "grad_norm": 1.547484278678894, + "learning_rate": 7.350003818094261e-06, + "loss": 0.6553, + "step": 2841 + }, + { + "epoch": 0.36, + "grad_norm": 2.531700849533081, + "learning_rate": 7.348172058942274e-06, + "loss": 0.6472, + "step": 2842 + }, + { + "epoch": 0.36, + "grad_norm": 1.1948038339614868, + "learning_rate": 7.346339895365378e-06, + "loss": 0.6535, + "step": 2843 + }, + { + "epoch": 0.36, + "grad_norm": 1.2925516366958618, + "learning_rate": 7.344507327679127e-06, + "loss": 0.5542, + "step": 2844 + }, + { + "epoch": 0.36, + "grad_norm": 1.377945899963379, + "learning_rate": 7.342674356199141e-06, + "loss": 0.687, + "step": 2845 + }, + { + "epoch": 0.36, + "grad_norm": 1.2123630046844482, + "learning_rate": 7.340840981241115e-06, + "loss": 0.5739, + "step": 2846 + }, + { + "epoch": 0.36, + "grad_norm": 1.087882161140442, + "learning_rate": 7.339007203120809e-06, + "loss": 0.6949, + "step": 2847 + }, + { + "epoch": 0.36, + "grad_norm": 1.039701223373413, + "learning_rate": 7.3371730221540545e-06, + "loss": 0.577, + "step": 2848 + }, + { + "epoch": 0.36, + "grad_norm": 1.535933494567871, + "learning_rate": 7.335338438656752e-06, + "loss": 0.6101, + "step": 2849 + }, + { + "epoch": 0.37, + "grad_norm": 1.4305620193481445, + "learning_rate": 7.333503452944872e-06, + "loss": 0.5939, + "step": 2850 + }, + { + "epoch": 0.37, + "grad_norm": 1.1246757507324219, + "learning_rate": 7.331668065334449e-06, + "loss": 0.6691, + "step": 2851 + }, + { + "epoch": 0.37, + "grad_norm": 1.2173248529434204, + "learning_rate": 7.329832276141597e-06, + "loss": 0.7914, + "step": 2852 + }, + { + "epoch": 0.37, + "grad_norm": 1.0413439273834229, + "learning_rate": 7.327996085682491e-06, + "loss": 0.6105, + "step": 2853 + }, + { + "epoch": 0.37, + "grad_norm": 1.1989972591400146, + "learning_rate": 7.326159494273377e-06, + "loss": 0.7641, + "step": 2854 + }, + { + "epoch": 0.37, + "grad_norm": 1.2141451835632324, + "learning_rate": 7.324322502230571e-06, + "loss": 0.6149, + "step": 2855 + }, + { + "epoch": 0.37, + "grad_norm": 1.3219760656356812, + "learning_rate": 7.322485109870458e-06, + "loss": 0.5544, + "step": 2856 + }, + { + "epoch": 0.37, + "grad_norm": 1.2472971677780151, + "learning_rate": 7.320647317509493e-06, + "loss": 0.628, + "step": 2857 + }, + { + "epoch": 0.37, + "grad_norm": 1.1860524415969849, + "learning_rate": 7.318809125464194e-06, + "loss": 0.5704, + "step": 2858 + }, + { + "epoch": 0.37, + "grad_norm": 1.7199898958206177, + "learning_rate": 7.316970534051155e-06, + "loss": 0.5817, + "step": 2859 + }, + { + "epoch": 0.37, + "grad_norm": 1.1000601053237915, + "learning_rate": 7.315131543587035e-06, + "loss": 0.5683, + "step": 2860 + }, + { + "epoch": 0.37, + "grad_norm": 1.0615863800048828, + "learning_rate": 7.3132921543885646e-06, + "loss": 0.5399, + "step": 2861 + }, + { + "epoch": 0.37, + "grad_norm": 1.2328920364379883, + "learning_rate": 7.31145236677254e-06, + "loss": 0.6103, + "step": 2862 + }, + { + "epoch": 0.37, + "grad_norm": 1.1900357007980347, + "learning_rate": 7.309612181055827e-06, + "loss": 0.5467, + "step": 2863 + }, + { + "epoch": 0.37, + "grad_norm": 1.1210532188415527, + "learning_rate": 7.30777159755536e-06, + "loss": 0.6925, + "step": 2864 + }, + { + "epoch": 0.37, + "grad_norm": 1.3341383934020996, + "learning_rate": 7.305930616588144e-06, + "loss": 0.5621, + "step": 2865 + }, + { + "epoch": 0.37, + "grad_norm": 1.700534701347351, + "learning_rate": 7.304089238471248e-06, + "loss": 0.695, + "step": 2866 + }, + { + "epoch": 0.37, + "grad_norm": 1.206648588180542, + "learning_rate": 7.302247463521813e-06, + "loss": 0.59, + "step": 2867 + }, + { + "epoch": 0.37, + "grad_norm": 1.0011547803878784, + "learning_rate": 7.300405292057048e-06, + "loss": 0.5618, + "step": 2868 + }, + { + "epoch": 0.37, + "grad_norm": 1.4958430528640747, + "learning_rate": 7.298562724394227e-06, + "loss": 0.4853, + "step": 2869 + }, + { + "epoch": 0.37, + "grad_norm": 1.4243090152740479, + "learning_rate": 7.296719760850699e-06, + "loss": 0.7581, + "step": 2870 + }, + { + "epoch": 0.37, + "grad_norm": 1.1251380443572998, + "learning_rate": 7.294876401743873e-06, + "loss": 0.6223, + "step": 2871 + }, + { + "epoch": 0.37, + "grad_norm": 1.3010609149932861, + "learning_rate": 7.293032647391234e-06, + "loss": 0.635, + "step": 2872 + }, + { + "epoch": 0.37, + "grad_norm": 1.1540263891220093, + "learning_rate": 7.291188498110328e-06, + "loss": 0.611, + "step": 2873 + }, + { + "epoch": 0.37, + "grad_norm": 2.1613638401031494, + "learning_rate": 7.289343954218772e-06, + "loss": 0.6606, + "step": 2874 + }, + { + "epoch": 0.37, + "grad_norm": 1.032372236251831, + "learning_rate": 7.287499016034255e-06, + "loss": 0.6623, + "step": 2875 + }, + { + "epoch": 0.37, + "grad_norm": 1.2156139612197876, + "learning_rate": 7.285653683874527e-06, + "loss": 0.6258, + "step": 2876 + }, + { + "epoch": 0.37, + "grad_norm": 1.3962489366531372, + "learning_rate": 7.2838079580574076e-06, + "loss": 0.5691, + "step": 2877 + }, + { + "epoch": 0.37, + "grad_norm": 1.1535193920135498, + "learning_rate": 7.28196183890079e-06, + "loss": 0.6384, + "step": 2878 + }, + { + "epoch": 0.37, + "grad_norm": 1.083708643913269, + "learning_rate": 7.2801153267226266e-06, + "loss": 0.5573, + "step": 2879 + }, + { + "epoch": 0.37, + "grad_norm": 1.3152596950531006, + "learning_rate": 7.278268421840944e-06, + "loss": 0.6413, + "step": 2880 + }, + { + "epoch": 0.37, + "grad_norm": 1.2211905717849731, + "learning_rate": 7.276421124573834e-06, + "loss": 0.608, + "step": 2881 + }, + { + "epoch": 0.37, + "grad_norm": 1.245803713798523, + "learning_rate": 7.274573435239454e-06, + "loss": 0.555, + "step": 2882 + }, + { + "epoch": 0.37, + "grad_norm": 1.663479208946228, + "learning_rate": 7.272725354156034e-06, + "loss": 0.5815, + "step": 2883 + }, + { + "epoch": 0.37, + "grad_norm": 1.306326150894165, + "learning_rate": 7.270876881641864e-06, + "loss": 0.5868, + "step": 2884 + }, + { + "epoch": 0.37, + "grad_norm": 1.315184235572815, + "learning_rate": 7.269028018015311e-06, + "loss": 0.6078, + "step": 2885 + }, + { + "epoch": 0.37, + "grad_norm": 1.3038302659988403, + "learning_rate": 7.267178763594797e-06, + "loss": 0.6068, + "step": 2886 + }, + { + "epoch": 0.37, + "grad_norm": 1.1452289819717407, + "learning_rate": 7.265329118698828e-06, + "loss": 0.5734, + "step": 2887 + }, + { + "epoch": 0.37, + "grad_norm": 1.0434083938598633, + "learning_rate": 7.263479083645961e-06, + "loss": 0.6097, + "step": 2888 + }, + { + "epoch": 0.37, + "grad_norm": 1.2132070064544678, + "learning_rate": 7.2616286587548305e-06, + "loss": 0.5651, + "step": 2889 + }, + { + "epoch": 0.37, + "grad_norm": 1.0834686756134033, + "learning_rate": 7.25977784434413e-06, + "loss": 0.6347, + "step": 2890 + }, + { + "epoch": 0.37, + "grad_norm": 1.3891212940216064, + "learning_rate": 7.25792664073263e-06, + "loss": 0.5851, + "step": 2891 + }, + { + "epoch": 0.37, + "grad_norm": 1.2323148250579834, + "learning_rate": 7.256075048239158e-06, + "loss": 0.7112, + "step": 2892 + }, + { + "epoch": 0.37, + "grad_norm": 1.1010910272598267, + "learning_rate": 7.254223067182618e-06, + "loss": 0.6333, + "step": 2893 + }, + { + "epoch": 0.37, + "grad_norm": 1.0941414833068848, + "learning_rate": 7.252370697881971e-06, + "loss": 0.5294, + "step": 2894 + }, + { + "epoch": 0.37, + "grad_norm": 1.3933345079421997, + "learning_rate": 7.250517940656253e-06, + "loss": 0.6761, + "step": 2895 + }, + { + "epoch": 0.37, + "grad_norm": 1.3738077878952026, + "learning_rate": 7.248664795824565e-06, + "loss": 0.6789, + "step": 2896 + }, + { + "epoch": 0.37, + "grad_norm": 1.0183112621307373, + "learning_rate": 7.246811263706069e-06, + "loss": 0.5176, + "step": 2897 + }, + { + "epoch": 0.37, + "grad_norm": 1.2617154121398926, + "learning_rate": 7.244957344620003e-06, + "loss": 0.7175, + "step": 2898 + }, + { + "epoch": 0.37, + "grad_norm": 1.2727289199829102, + "learning_rate": 7.243103038885664e-06, + "loss": 0.6891, + "step": 2899 + }, + { + "epoch": 0.37, + "grad_norm": 1.2516908645629883, + "learning_rate": 7.241248346822421e-06, + "loss": 0.6941, + "step": 2900 + }, + { + "epoch": 0.37, + "grad_norm": 1.441074013710022, + "learning_rate": 7.2393932687497015e-06, + "loss": 0.6303, + "step": 2901 + }, + { + "epoch": 0.37, + "grad_norm": 1.305977702140808, + "learning_rate": 7.237537804987013e-06, + "loss": 0.7342, + "step": 2902 + }, + { + "epoch": 0.37, + "grad_norm": 1.399827480316162, + "learning_rate": 7.2356819558539124e-06, + "loss": 0.6366, + "step": 2903 + }, + { + "epoch": 0.37, + "grad_norm": 1.2844167947769165, + "learning_rate": 7.233825721670042e-06, + "loss": 0.5527, + "step": 2904 + }, + { + "epoch": 0.37, + "grad_norm": 1.3156801462173462, + "learning_rate": 7.231969102755093e-06, + "loss": 0.6165, + "step": 2905 + }, + { + "epoch": 0.37, + "grad_norm": 1.2262427806854248, + "learning_rate": 7.230112099428832e-06, + "loss": 0.578, + "step": 2906 + }, + { + "epoch": 0.37, + "grad_norm": 1.332987904548645, + "learning_rate": 7.228254712011091e-06, + "loss": 0.5501, + "step": 2907 + }, + { + "epoch": 0.37, + "grad_norm": 1.1426584720611572, + "learning_rate": 7.226396940821767e-06, + "loss": 0.5759, + "step": 2908 + }, + { + "epoch": 0.37, + "grad_norm": 1.636731505393982, + "learning_rate": 7.224538786180824e-06, + "loss": 0.6399, + "step": 2909 + }, + { + "epoch": 0.37, + "grad_norm": 1.7168676853179932, + "learning_rate": 7.222680248408289e-06, + "loss": 0.674, + "step": 2910 + }, + { + "epoch": 0.37, + "grad_norm": 1.3796486854553223, + "learning_rate": 7.22082132782426e-06, + "loss": 0.6683, + "step": 2911 + }, + { + "epoch": 0.37, + "grad_norm": 1.0893101692199707, + "learning_rate": 7.218962024748896e-06, + "loss": 0.5658, + "step": 2912 + }, + { + "epoch": 0.37, + "grad_norm": 1.2268693447113037, + "learning_rate": 7.217102339502426e-06, + "loss": 0.6162, + "step": 2913 + }, + { + "epoch": 0.37, + "grad_norm": 1.2177835702896118, + "learning_rate": 7.215242272405142e-06, + "loss": 0.6081, + "step": 2914 + }, + { + "epoch": 0.37, + "grad_norm": 1.2131762504577637, + "learning_rate": 7.213381823777404e-06, + "loss": 0.5518, + "step": 2915 + }, + { + "epoch": 0.37, + "grad_norm": 1.3434334993362427, + "learning_rate": 7.2115209939396345e-06, + "loss": 0.6446, + "step": 2916 + }, + { + "epoch": 0.37, + "grad_norm": 1.2754311561584473, + "learning_rate": 7.209659783212326e-06, + "loss": 0.6655, + "step": 2917 + }, + { + "epoch": 0.37, + "grad_norm": 1.3804233074188232, + "learning_rate": 7.207798191916031e-06, + "loss": 0.6211, + "step": 2918 + }, + { + "epoch": 0.37, + "grad_norm": 1.2941632270812988, + "learning_rate": 7.205936220371374e-06, + "loss": 0.5307, + "step": 2919 + }, + { + "epoch": 0.37, + "grad_norm": 1.4709851741790771, + "learning_rate": 7.20407386889904e-06, + "loss": 0.5958, + "step": 2920 + }, + { + "epoch": 0.37, + "grad_norm": 1.3792892694473267, + "learning_rate": 7.202211137819781e-06, + "loss": 0.6584, + "step": 2921 + }, + { + "epoch": 0.37, + "grad_norm": 1.2968083620071411, + "learning_rate": 7.2003480274544156e-06, + "loss": 0.5921, + "step": 2922 + }, + { + "epoch": 0.37, + "grad_norm": 1.2979252338409424, + "learning_rate": 7.198484538123826e-06, + "loss": 0.6601, + "step": 2923 + }, + { + "epoch": 0.37, + "grad_norm": 1.3300632238388062, + "learning_rate": 7.196620670148961e-06, + "loss": 0.7856, + "step": 2924 + }, + { + "epoch": 0.37, + "grad_norm": 1.1163116693496704, + "learning_rate": 7.194756423850833e-06, + "loss": 0.6127, + "step": 2925 + }, + { + "epoch": 0.37, + "grad_norm": 1.256772518157959, + "learning_rate": 7.192891799550522e-06, + "loss": 0.5977, + "step": 2926 + }, + { + "epoch": 0.37, + "grad_norm": 1.180251955986023, + "learning_rate": 7.19102679756917e-06, + "loss": 0.6473, + "step": 2927 + }, + { + "epoch": 0.38, + "grad_norm": 1.2117708921432495, + "learning_rate": 7.189161418227987e-06, + "loss": 0.6191, + "step": 2928 + }, + { + "epoch": 0.38, + "grad_norm": 1.150450587272644, + "learning_rate": 7.187295661848243e-06, + "loss": 0.7571, + "step": 2929 + }, + { + "epoch": 0.38, + "grad_norm": 1.3137658834457397, + "learning_rate": 7.185429528751285e-06, + "loss": 0.5918, + "step": 2930 + }, + { + "epoch": 0.38, + "grad_norm": 1.3359402418136597, + "learning_rate": 7.1835630192585085e-06, + "loss": 0.5988, + "step": 2931 + }, + { + "epoch": 0.38, + "grad_norm": 1.3789992332458496, + "learning_rate": 7.1816961336913835e-06, + "loss": 0.6197, + "step": 2932 + }, + { + "epoch": 0.38, + "grad_norm": 1.2217789888381958, + "learning_rate": 7.179828872371446e-06, + "loss": 0.5909, + "step": 2933 + }, + { + "epoch": 0.38, + "grad_norm": 1.4157116413116455, + "learning_rate": 7.17796123562029e-06, + "loss": 0.5482, + "step": 2934 + }, + { + "epoch": 0.38, + "grad_norm": 1.2854241132736206, + "learning_rate": 7.176093223759581e-06, + "loss": 0.5544, + "step": 2935 + }, + { + "epoch": 0.38, + "grad_norm": 1.2791540622711182, + "learning_rate": 7.174224837111044e-06, + "loss": 0.6145, + "step": 2936 + }, + { + "epoch": 0.38, + "grad_norm": 1.3771109580993652, + "learning_rate": 7.172356075996473e-06, + "loss": 0.6808, + "step": 2937 + }, + { + "epoch": 0.38, + "grad_norm": 1.5977014303207397, + "learning_rate": 7.170486940737722e-06, + "loss": 0.7052, + "step": 2938 + }, + { + "epoch": 0.38, + "grad_norm": 1.30767023563385, + "learning_rate": 7.168617431656713e-06, + "loss": 0.674, + "step": 2939 + }, + { + "epoch": 0.38, + "grad_norm": 1.5938276052474976, + "learning_rate": 7.166747549075428e-06, + "loss": 0.6467, + "step": 2940 + }, + { + "epoch": 0.38, + "grad_norm": 1.34315824508667, + "learning_rate": 7.164877293315921e-06, + "loss": 0.5739, + "step": 2941 + }, + { + "epoch": 0.38, + "grad_norm": 1.2254630327224731, + "learning_rate": 7.163006664700302e-06, + "loss": 0.6603, + "step": 2942 + }, + { + "epoch": 0.38, + "grad_norm": 1.1479889154434204, + "learning_rate": 7.16113566355075e-06, + "loss": 0.6333, + "step": 2943 + }, + { + "epoch": 0.38, + "grad_norm": 1.111104130744934, + "learning_rate": 7.159264290189506e-06, + "loss": 0.6006, + "step": 2944 + }, + { + "epoch": 0.38, + "grad_norm": 1.1808959245681763, + "learning_rate": 7.157392544938877e-06, + "loss": 0.5819, + "step": 2945 + }, + { + "epoch": 0.38, + "grad_norm": 1.5967776775360107, + "learning_rate": 7.155520428121233e-06, + "loss": 0.649, + "step": 2946 + }, + { + "epoch": 0.38, + "grad_norm": 1.1286388635635376, + "learning_rate": 7.153647940059007e-06, + "loss": 0.6107, + "step": 2947 + }, + { + "epoch": 0.38, + "grad_norm": 1.261406421661377, + "learning_rate": 7.1517750810746986e-06, + "loss": 0.6149, + "step": 2948 + }, + { + "epoch": 0.38, + "grad_norm": 1.5292327404022217, + "learning_rate": 7.149901851490869e-06, + "loss": 0.6509, + "step": 2949 + }, + { + "epoch": 0.38, + "grad_norm": 1.308585524559021, + "learning_rate": 7.148028251630144e-06, + "loss": 0.5977, + "step": 2950 + }, + { + "epoch": 0.38, + "grad_norm": 1.1514670848846436, + "learning_rate": 7.146154281815213e-06, + "loss": 0.6166, + "step": 2951 + }, + { + "epoch": 0.38, + "grad_norm": 1.0874031782150269, + "learning_rate": 7.144279942368829e-06, + "loss": 0.6517, + "step": 2952 + }, + { + "epoch": 0.38, + "grad_norm": 1.1332695484161377, + "learning_rate": 7.1424052336138094e-06, + "loss": 0.5677, + "step": 2953 + }, + { + "epoch": 0.38, + "grad_norm": 1.1511764526367188, + "learning_rate": 7.140530155873033e-06, + "loss": 0.5318, + "step": 2954 + }, + { + "epoch": 0.38, + "grad_norm": 1.207506537437439, + "learning_rate": 7.138654709469446e-06, + "loss": 0.6273, + "step": 2955 + }, + { + "epoch": 0.38, + "grad_norm": 1.4010735750198364, + "learning_rate": 7.136778894726055e-06, + "loss": 0.6545, + "step": 2956 + }, + { + "epoch": 0.38, + "grad_norm": 1.8122620582580566, + "learning_rate": 7.134902711965932e-06, + "loss": 0.6774, + "step": 2957 + }, + { + "epoch": 0.38, + "grad_norm": 1.2734031677246094, + "learning_rate": 7.133026161512209e-06, + "loss": 0.5889, + "step": 2958 + }, + { + "epoch": 0.38, + "grad_norm": 1.1605252027511597, + "learning_rate": 7.131149243688086e-06, + "loss": 0.7358, + "step": 2959 + }, + { + "epoch": 0.38, + "grad_norm": 1.0211987495422363, + "learning_rate": 7.1292719588168225e-06, + "loss": 0.5536, + "step": 2960 + }, + { + "epoch": 0.38, + "grad_norm": 1.1764460802078247, + "learning_rate": 7.127394307221743e-06, + "loss": 0.6021, + "step": 2961 + }, + { + "epoch": 0.38, + "grad_norm": 1.449872612953186, + "learning_rate": 7.125516289226236e-06, + "loss": 0.6723, + "step": 2962 + }, + { + "epoch": 0.38, + "grad_norm": 1.2078107595443726, + "learning_rate": 7.123637905153749e-06, + "loss": 0.5864, + "step": 2963 + }, + { + "epoch": 0.38, + "grad_norm": 1.082673192024231, + "learning_rate": 7.121759155327799e-06, + "loss": 0.6392, + "step": 2964 + }, + { + "epoch": 0.38, + "grad_norm": 2.1781044006347656, + "learning_rate": 7.11988004007196e-06, + "loss": 0.645, + "step": 2965 + }, + { + "epoch": 0.38, + "grad_norm": 1.0570480823516846, + "learning_rate": 7.118000559709872e-06, + "loss": 0.56, + "step": 2966 + }, + { + "epoch": 0.38, + "grad_norm": 1.205815076828003, + "learning_rate": 7.1161207145652385e-06, + "loss": 0.5751, + "step": 2967 + }, + { + "epoch": 0.38, + "grad_norm": 3.2318918704986572, + "learning_rate": 7.114240504961824e-06, + "loss": 0.6159, + "step": 2968 + }, + { + "epoch": 0.38, + "grad_norm": 1.0701419115066528, + "learning_rate": 7.112359931223456e-06, + "loss": 0.6113, + "step": 2969 + }, + { + "epoch": 0.38, + "grad_norm": 1.3909010887145996, + "learning_rate": 7.110478993674023e-06, + "loss": 0.6169, + "step": 2970 + }, + { + "epoch": 0.38, + "grad_norm": 1.2270265817642212, + "learning_rate": 7.108597692637483e-06, + "loss": 0.6437, + "step": 2971 + }, + { + "epoch": 0.38, + "grad_norm": 1.7526935338974, + "learning_rate": 7.106716028437848e-06, + "loss": 0.5907, + "step": 2972 + }, + { + "epoch": 0.38, + "grad_norm": 1.3577011823654175, + "learning_rate": 7.104834001399198e-06, + "loss": 0.6427, + "step": 2973 + }, + { + "epoch": 0.38, + "grad_norm": 1.416495680809021, + "learning_rate": 7.102951611845675e-06, + "loss": 0.6881, + "step": 2974 + }, + { + "epoch": 0.38, + "grad_norm": 1.134766936302185, + "learning_rate": 7.1010688601014786e-06, + "loss": 0.6554, + "step": 2975 + }, + { + "epoch": 0.38, + "grad_norm": 1.2442423105239868, + "learning_rate": 7.099185746490878e-06, + "loss": 0.5727, + "step": 2976 + }, + { + "epoch": 0.38, + "grad_norm": 1.257924199104309, + "learning_rate": 7.0973022713382e-06, + "loss": 0.6091, + "step": 2977 + }, + { + "epoch": 0.38, + "grad_norm": 1.4289751052856445, + "learning_rate": 7.0954184349678355e-06, + "loss": 0.6965, + "step": 2978 + }, + { + "epoch": 0.38, + "grad_norm": 1.3827462196350098, + "learning_rate": 7.0935342377042346e-06, + "loss": 0.7763, + "step": 2979 + }, + { + "epoch": 0.38, + "grad_norm": 1.0459548234939575, + "learning_rate": 7.091649679871915e-06, + "loss": 0.618, + "step": 2980 + }, + { + "epoch": 0.38, + "grad_norm": 1.6075551509857178, + "learning_rate": 7.0897647617954536e-06, + "loss": 0.5657, + "step": 2981 + }, + { + "epoch": 0.38, + "grad_norm": 1.1650707721710205, + "learning_rate": 7.087879483799487e-06, + "loss": 0.746, + "step": 2982 + }, + { + "epoch": 0.38, + "grad_norm": 1.0505053997039795, + "learning_rate": 7.085993846208718e-06, + "loss": 0.6344, + "step": 2983 + }, + { + "epoch": 0.38, + "grad_norm": 1.2678617238998413, + "learning_rate": 7.0841078493479066e-06, + "loss": 0.6389, + "step": 2984 + }, + { + "epoch": 0.38, + "grad_norm": 1.1751099824905396, + "learning_rate": 7.082221493541881e-06, + "loss": 0.5815, + "step": 2985 + }, + { + "epoch": 0.38, + "grad_norm": 1.1204549074172974, + "learning_rate": 7.080334779115525e-06, + "loss": 0.6307, + "step": 2986 + }, + { + "epoch": 0.38, + "grad_norm": 1.218030571937561, + "learning_rate": 7.078447706393788e-06, + "loss": 0.6361, + "step": 2987 + }, + { + "epoch": 0.38, + "grad_norm": 1.160930871963501, + "learning_rate": 7.07656027570168e-06, + "loss": 0.6299, + "step": 2988 + }, + { + "epoch": 0.38, + "grad_norm": 1.3285592794418335, + "learning_rate": 7.07467248736427e-06, + "loss": 0.6738, + "step": 2989 + }, + { + "epoch": 0.38, + "grad_norm": 1.2137503623962402, + "learning_rate": 7.072784341706696e-06, + "loss": 0.6647, + "step": 2990 + }, + { + "epoch": 0.38, + "grad_norm": 1.0857856273651123, + "learning_rate": 7.0708958390541485e-06, + "loss": 0.5601, + "step": 2991 + }, + { + "epoch": 0.38, + "grad_norm": 1.3946149349212646, + "learning_rate": 7.069006979731885e-06, + "loss": 0.6107, + "step": 2992 + }, + { + "epoch": 0.38, + "grad_norm": 1.3853344917297363, + "learning_rate": 7.067117764065226e-06, + "loss": 0.7235, + "step": 2993 + }, + { + "epoch": 0.38, + "grad_norm": 1.084587574005127, + "learning_rate": 7.065228192379545e-06, + "loss": 0.5808, + "step": 2994 + }, + { + "epoch": 0.38, + "grad_norm": 1.4154893159866333, + "learning_rate": 7.0633382650002854e-06, + "loss": 0.7262, + "step": 2995 + }, + { + "epoch": 0.38, + "grad_norm": 1.2447575330734253, + "learning_rate": 7.061447982252949e-06, + "loss": 0.6407, + "step": 2996 + }, + { + "epoch": 0.38, + "grad_norm": 1.1402736902236938, + "learning_rate": 7.0595573444631e-06, + "loss": 0.5344, + "step": 2997 + }, + { + "epoch": 0.38, + "grad_norm": 1.4848835468292236, + "learning_rate": 7.0576663519563584e-06, + "loss": 0.6223, + "step": 2998 + }, + { + "epoch": 0.38, + "grad_norm": 1.1726362705230713, + "learning_rate": 7.0557750050584115e-06, + "loss": 0.6763, + "step": 2999 + }, + { + "epoch": 0.38, + "grad_norm": 1.2380672693252563, + "learning_rate": 7.0538833040950065e-06, + "loss": 0.6007, + "step": 3000 + }, + { + "epoch": 0.38, + "grad_norm": 1.149147629737854, + "learning_rate": 7.051991249391947e-06, + "loss": 0.5811, + "step": 3001 + }, + { + "epoch": 0.38, + "grad_norm": 1.3859102725982666, + "learning_rate": 7.0500988412751044e-06, + "loss": 0.5532, + "step": 3002 + }, + { + "epoch": 0.38, + "grad_norm": 1.054567813873291, + "learning_rate": 7.048206080070407e-06, + "loss": 0.6674, + "step": 3003 + }, + { + "epoch": 0.38, + "grad_norm": 1.1755194664001465, + "learning_rate": 7.046312966103843e-06, + "loss": 0.6072, + "step": 3004 + }, + { + "epoch": 0.38, + "grad_norm": 1.3620301485061646, + "learning_rate": 7.044419499701462e-06, + "loss": 0.6302, + "step": 3005 + }, + { + "epoch": 0.39, + "grad_norm": 1.4070069789886475, + "learning_rate": 7.042525681189377e-06, + "loss": 0.6966, + "step": 3006 + }, + { + "epoch": 0.39, + "grad_norm": 1.1201298236846924, + "learning_rate": 7.0406315108937605e-06, + "loss": 0.646, + "step": 3007 + }, + { + "epoch": 0.39, + "grad_norm": 1.182099461555481, + "learning_rate": 7.038736989140843e-06, + "loss": 0.6491, + "step": 3008 + }, + { + "epoch": 0.39, + "grad_norm": 1.2445439100265503, + "learning_rate": 7.036842116256919e-06, + "loss": 0.6441, + "step": 3009 + }, + { + "epoch": 0.39, + "grad_norm": 0.9912326335906982, + "learning_rate": 7.034946892568339e-06, + "loss": 0.6075, + "step": 3010 + }, + { + "epoch": 0.39, + "grad_norm": 1.477307915687561, + "learning_rate": 7.03305131840152e-06, + "loss": 0.5633, + "step": 3011 + }, + { + "epoch": 0.39, + "grad_norm": 1.2548702955245972, + "learning_rate": 7.031155394082935e-06, + "loss": 0.6102, + "step": 3012 + }, + { + "epoch": 0.39, + "grad_norm": 1.1897332668304443, + "learning_rate": 7.029259119939118e-06, + "loss": 0.6613, + "step": 3013 + }, + { + "epoch": 0.39, + "grad_norm": 1.2398009300231934, + "learning_rate": 7.027362496296662e-06, + "loss": 0.6149, + "step": 3014 + }, + { + "epoch": 0.39, + "grad_norm": 1.3539067506790161, + "learning_rate": 7.025465523482225e-06, + "loss": 0.5495, + "step": 3015 + }, + { + "epoch": 0.39, + "grad_norm": 1.1485373973846436, + "learning_rate": 7.023568201822519e-06, + "loss": 0.5772, + "step": 3016 + }, + { + "epoch": 0.39, + "grad_norm": 1.3308528661727905, + "learning_rate": 7.021670531644324e-06, + "loss": 0.5792, + "step": 3017 + }, + { + "epoch": 0.39, + "grad_norm": 1.2606019973754883, + "learning_rate": 7.01977251327447e-06, + "loss": 0.6744, + "step": 3018 + }, + { + "epoch": 0.39, + "grad_norm": 1.0969258546829224, + "learning_rate": 7.017874147039855e-06, + "loss": 0.6242, + "step": 3019 + }, + { + "epoch": 0.39, + "grad_norm": 1.1134408712387085, + "learning_rate": 7.0159754332674316e-06, + "loss": 0.6511, + "step": 3020 + }, + { + "epoch": 0.39, + "grad_norm": 1.1563353538513184, + "learning_rate": 7.014076372284217e-06, + "loss": 0.712, + "step": 3021 + }, + { + "epoch": 0.39, + "grad_norm": 1.1367099285125732, + "learning_rate": 7.012176964417284e-06, + "loss": 0.6286, + "step": 3022 + }, + { + "epoch": 0.39, + "grad_norm": 1.443542718887329, + "learning_rate": 7.010277209993769e-06, + "loss": 0.6477, + "step": 3023 + }, + { + "epoch": 0.39, + "grad_norm": 1.0937227010726929, + "learning_rate": 7.008377109340865e-06, + "loss": 0.6286, + "step": 3024 + }, + { + "epoch": 0.39, + "grad_norm": 1.4116562604904175, + "learning_rate": 7.006476662785825e-06, + "loss": 0.6758, + "step": 3025 + }, + { + "epoch": 0.39, + "grad_norm": 0.9480895400047302, + "learning_rate": 7.004575870655963e-06, + "loss": 0.5669, + "step": 3026 + }, + { + "epoch": 0.39, + "grad_norm": 1.328150987625122, + "learning_rate": 7.002674733278652e-06, + "loss": 0.6269, + "step": 3027 + }, + { + "epoch": 0.39, + "grad_norm": 1.224358320236206, + "learning_rate": 7.000773250981325e-06, + "loss": 0.626, + "step": 3028 + }, + { + "epoch": 0.39, + "grad_norm": 1.514603853225708, + "learning_rate": 6.998871424091472e-06, + "loss": 0.6341, + "step": 3029 + }, + { + "epoch": 0.39, + "grad_norm": 1.0662037134170532, + "learning_rate": 6.996969252936645e-06, + "loss": 0.5783, + "step": 3030 + }, + { + "epoch": 0.39, + "grad_norm": 1.2578132152557373, + "learning_rate": 6.995066737844454e-06, + "loss": 0.6042, + "step": 3031 + }, + { + "epoch": 0.39, + "grad_norm": 1.1706691980361938, + "learning_rate": 6.993163879142567e-06, + "loss": 0.6136, + "step": 3032 + }, + { + "epoch": 0.39, + "grad_norm": 0.9942347407341003, + "learning_rate": 6.991260677158717e-06, + "loss": 0.6262, + "step": 3033 + }, + { + "epoch": 0.39, + "grad_norm": 1.1480199098587036, + "learning_rate": 6.989357132220686e-06, + "loss": 0.5928, + "step": 3034 + }, + { + "epoch": 0.39, + "grad_norm": 1.1092981100082397, + "learning_rate": 6.987453244656328e-06, + "loss": 0.6324, + "step": 3035 + }, + { + "epoch": 0.39, + "grad_norm": 1.462157130241394, + "learning_rate": 6.985549014793542e-06, + "loss": 0.6114, + "step": 3036 + }, + { + "epoch": 0.39, + "grad_norm": 0.9729678630828857, + "learning_rate": 6.983644442960299e-06, + "loss": 0.644, + "step": 3037 + }, + { + "epoch": 0.39, + "grad_norm": 1.6944612264633179, + "learning_rate": 6.9817395294846165e-06, + "loss": 0.5726, + "step": 3038 + }, + { + "epoch": 0.39, + "grad_norm": 1.0340354442596436, + "learning_rate": 6.979834274694583e-06, + "loss": 0.6147, + "step": 3039 + }, + { + "epoch": 0.39, + "grad_norm": 1.3940993547439575, + "learning_rate": 6.977928678918335e-06, + "loss": 0.6238, + "step": 3040 + }, + { + "epoch": 0.39, + "grad_norm": 1.217641830444336, + "learning_rate": 6.976022742484076e-06, + "loss": 0.6713, + "step": 3041 + }, + { + "epoch": 0.39, + "grad_norm": 5.02081298828125, + "learning_rate": 6.974116465720064e-06, + "loss": 0.6449, + "step": 3042 + }, + { + "epoch": 0.39, + "grad_norm": 1.6640965938568115, + "learning_rate": 6.972209848954617e-06, + "loss": 0.6568, + "step": 3043 + }, + { + "epoch": 0.39, + "grad_norm": 1.213199496269226, + "learning_rate": 6.97030289251611e-06, + "loss": 0.5438, + "step": 3044 + }, + { + "epoch": 0.39, + "grad_norm": 1.1289682388305664, + "learning_rate": 6.968395596732977e-06, + "loss": 0.6994, + "step": 3045 + }, + { + "epoch": 0.39, + "grad_norm": 1.1272958517074585, + "learning_rate": 6.9664879619337115e-06, + "loss": 0.5982, + "step": 3046 + }, + { + "epoch": 0.39, + "grad_norm": 1.7126679420471191, + "learning_rate": 6.964579988446866e-06, + "loss": 0.5872, + "step": 3047 + }, + { + "epoch": 0.39, + "grad_norm": 1.227146863937378, + "learning_rate": 6.962671676601048e-06, + "loss": 0.6131, + "step": 3048 + }, + { + "epoch": 0.39, + "grad_norm": 1.314324140548706, + "learning_rate": 6.960763026724926e-06, + "loss": 0.584, + "step": 3049 + }, + { + "epoch": 0.39, + "grad_norm": 1.2320308685302734, + "learning_rate": 6.958854039147227e-06, + "loss": 0.5748, + "step": 3050 + }, + { + "epoch": 0.39, + "grad_norm": 1.2381142377853394, + "learning_rate": 6.956944714196735e-06, + "loss": 0.6293, + "step": 3051 + }, + { + "epoch": 0.39, + "grad_norm": 1.3332390785217285, + "learning_rate": 6.9550350522022935e-06, + "loss": 0.5833, + "step": 3052 + }, + { + "epoch": 0.39, + "grad_norm": 1.1489558219909668, + "learning_rate": 6.953125053492801e-06, + "loss": 0.5628, + "step": 3053 + }, + { + "epoch": 0.39, + "grad_norm": 1.03483247756958, + "learning_rate": 6.951214718397217e-06, + "loss": 0.5724, + "step": 3054 + }, + { + "epoch": 0.39, + "grad_norm": 1.5205141305923462, + "learning_rate": 6.9493040472445575e-06, + "loss": 0.6287, + "step": 3055 + }, + { + "epoch": 0.39, + "grad_norm": 1.2520228624343872, + "learning_rate": 6.947393040363897e-06, + "loss": 0.6165, + "step": 3056 + }, + { + "epoch": 0.39, + "grad_norm": 1.1673003435134888, + "learning_rate": 6.945481698084366e-06, + "loss": 0.6758, + "step": 3057 + }, + { + "epoch": 0.39, + "grad_norm": 1.1318930387496948, + "learning_rate": 6.943570020735158e-06, + "loss": 0.6002, + "step": 3058 + }, + { + "epoch": 0.39, + "grad_norm": 1.2879729270935059, + "learning_rate": 6.941658008645518e-06, + "loss": 0.6208, + "step": 3059 + }, + { + "epoch": 0.39, + "grad_norm": 1.3986260890960693, + "learning_rate": 6.939745662144751e-06, + "loss": 0.5997, + "step": 3060 + }, + { + "epoch": 0.39, + "grad_norm": 1.7398829460144043, + "learning_rate": 6.9378329815622215e-06, + "loss": 0.6191, + "step": 3061 + }, + { + "epoch": 0.39, + "grad_norm": 1.0883538722991943, + "learning_rate": 6.935919967227348e-06, + "loss": 0.6555, + "step": 3062 + }, + { + "epoch": 0.39, + "grad_norm": 1.441008448600769, + "learning_rate": 6.93400661946961e-06, + "loss": 0.6771, + "step": 3063 + }, + { + "epoch": 0.39, + "grad_norm": 1.193451166152954, + "learning_rate": 6.932092938618541e-06, + "loss": 0.6785, + "step": 3064 + }, + { + "epoch": 0.39, + "grad_norm": 1.0834167003631592, + "learning_rate": 6.930178925003735e-06, + "loss": 0.6142, + "step": 3065 + }, + { + "epoch": 0.39, + "grad_norm": 1.0183688402175903, + "learning_rate": 6.928264578954841e-06, + "loss": 0.6609, + "step": 3066 + }, + { + "epoch": 0.39, + "grad_norm": 1.2965633869171143, + "learning_rate": 6.926349900801568e-06, + "loss": 0.6554, + "step": 3067 + }, + { + "epoch": 0.39, + "grad_norm": 1.4456558227539062, + "learning_rate": 6.924434890873677e-06, + "loss": 0.6321, + "step": 3068 + }, + { + "epoch": 0.39, + "grad_norm": 1.5659126043319702, + "learning_rate": 6.922519549500994e-06, + "loss": 0.6931, + "step": 3069 + }, + { + "epoch": 0.39, + "grad_norm": 1.1102631092071533, + "learning_rate": 6.920603877013393e-06, + "loss": 0.6349, + "step": 3070 + }, + { + "epoch": 0.39, + "grad_norm": 1.2046291828155518, + "learning_rate": 6.918687873740815e-06, + "loss": 0.6647, + "step": 3071 + }, + { + "epoch": 0.39, + "grad_norm": 1.7100063562393188, + "learning_rate": 6.916771540013246e-06, + "loss": 0.6549, + "step": 3072 + }, + { + "epoch": 0.39, + "grad_norm": 1.1494594812393188, + "learning_rate": 6.914854876160741e-06, + "loss": 0.619, + "step": 3073 + }, + { + "epoch": 0.39, + "grad_norm": 1.156104564666748, + "learning_rate": 6.912937882513404e-06, + "loss": 0.7542, + "step": 3074 + }, + { + "epoch": 0.39, + "grad_norm": 1.092430591583252, + "learning_rate": 6.911020559401399e-06, + "loss": 0.5925, + "step": 3075 + }, + { + "epoch": 0.39, + "grad_norm": 1.2527555227279663, + "learning_rate": 6.909102907154946e-06, + "loss": 0.6515, + "step": 3076 + }, + { + "epoch": 0.39, + "grad_norm": 1.0143706798553467, + "learning_rate": 6.90718492610432e-06, + "loss": 0.5752, + "step": 3077 + }, + { + "epoch": 0.39, + "grad_norm": 1.188001036643982, + "learning_rate": 6.905266616579857e-06, + "loss": 0.5966, + "step": 3078 + }, + { + "epoch": 0.39, + "grad_norm": 1.197407841682434, + "learning_rate": 6.903347978911944e-06, + "loss": 0.5806, + "step": 3079 + }, + { + "epoch": 0.39, + "grad_norm": 1.2781100273132324, + "learning_rate": 6.9014290134310294e-06, + "loss": 0.6362, + "step": 3080 + }, + { + "epoch": 0.39, + "grad_norm": 1.1337027549743652, + "learning_rate": 6.899509720467614e-06, + "loss": 0.6279, + "step": 3081 + }, + { + "epoch": 0.39, + "grad_norm": 1.2396838665008545, + "learning_rate": 6.897590100352261e-06, + "loss": 0.5464, + "step": 3082 + }, + { + "epoch": 0.39, + "grad_norm": 0.947368860244751, + "learning_rate": 6.89567015341558e-06, + "loss": 0.6205, + "step": 3083 + }, + { + "epoch": 0.4, + "grad_norm": 1.04353928565979, + "learning_rate": 6.893749879988248e-06, + "loss": 0.6318, + "step": 3084 + }, + { + "epoch": 0.4, + "grad_norm": 1.1901612281799316, + "learning_rate": 6.89182928040099e-06, + "loss": 0.5634, + "step": 3085 + }, + { + "epoch": 0.4, + "grad_norm": 1.215627670288086, + "learning_rate": 6.8899083549845914e-06, + "loss": 0.6177, + "step": 3086 + }, + { + "epoch": 0.4, + "grad_norm": 1.5702704191207886, + "learning_rate": 6.8879871040698935e-06, + "loss": 0.6184, + "step": 3087 + }, + { + "epoch": 0.4, + "grad_norm": 1.1875890493392944, + "learning_rate": 6.886065527987791e-06, + "loss": 0.5802, + "step": 3088 + }, + { + "epoch": 0.4, + "grad_norm": 1.155051350593567, + "learning_rate": 6.884143627069236e-06, + "loss": 0.6456, + "step": 3089 + }, + { + "epoch": 0.4, + "grad_norm": 1.4234286546707153, + "learning_rate": 6.882221401645239e-06, + "loss": 0.6865, + "step": 3090 + }, + { + "epoch": 0.4, + "grad_norm": 1.3715044260025024, + "learning_rate": 6.880298852046863e-06, + "loss": 0.5297, + "step": 3091 + }, + { + "epoch": 0.4, + "grad_norm": 1.273527979850769, + "learning_rate": 6.878375978605227e-06, + "loss": 0.5526, + "step": 3092 + }, + { + "epoch": 0.4, + "grad_norm": 1.063097596168518, + "learning_rate": 6.8764527816515105e-06, + "loss": 0.551, + "step": 3093 + }, + { + "epoch": 0.4, + "grad_norm": 1.2897167205810547, + "learning_rate": 6.874529261516941e-06, + "loss": 0.5874, + "step": 3094 + }, + { + "epoch": 0.4, + "grad_norm": 1.6665688753128052, + "learning_rate": 6.872605418532808e-06, + "loss": 0.5858, + "step": 3095 + }, + { + "epoch": 0.4, + "grad_norm": 1.0794209241867065, + "learning_rate": 6.870681253030453e-06, + "loss": 0.6313, + "step": 3096 + }, + { + "epoch": 0.4, + "grad_norm": 1.3062844276428223, + "learning_rate": 6.868756765341278e-06, + "loss": 0.686, + "step": 3097 + }, + { + "epoch": 0.4, + "grad_norm": 1.1371840238571167, + "learning_rate": 6.866831955796731e-06, + "loss": 0.525, + "step": 3098 + }, + { + "epoch": 0.4, + "grad_norm": 1.2220420837402344, + "learning_rate": 6.864906824728326e-06, + "loss": 0.6384, + "step": 3099 + }, + { + "epoch": 0.4, + "grad_norm": 4.069206237792969, + "learning_rate": 6.862981372467626e-06, + "loss": 0.6486, + "step": 3100 + }, + { + "epoch": 0.4, + "grad_norm": 1.6746448278427124, + "learning_rate": 6.861055599346249e-06, + "loss": 0.5713, + "step": 3101 + }, + { + "epoch": 0.4, + "grad_norm": 1.100169062614441, + "learning_rate": 6.859129505695874e-06, + "loss": 0.5867, + "step": 3102 + }, + { + "epoch": 0.4, + "grad_norm": 1.2251650094985962, + "learning_rate": 6.85720309184823e-06, + "loss": 0.61, + "step": 3103 + }, + { + "epoch": 0.4, + "grad_norm": 1.1724867820739746, + "learning_rate": 6.855276358135102e-06, + "loss": 0.6248, + "step": 3104 + }, + { + "epoch": 0.4, + "grad_norm": 1.0926048755645752, + "learning_rate": 6.853349304888331e-06, + "loss": 0.5075, + "step": 3105 + }, + { + "epoch": 0.4, + "grad_norm": 1.4889264106750488, + "learning_rate": 6.851421932439815e-06, + "loss": 0.5743, + "step": 3106 + }, + { + "epoch": 0.4, + "grad_norm": 1.1425611972808838, + "learning_rate": 6.8494942411215e-06, + "loss": 0.68, + "step": 3107 + }, + { + "epoch": 0.4, + "grad_norm": 1.138596534729004, + "learning_rate": 6.847566231265397e-06, + "loss": 0.6524, + "step": 3108 + }, + { + "epoch": 0.4, + "grad_norm": 1.3879531621932983, + "learning_rate": 6.845637903203562e-06, + "loss": 0.6623, + "step": 3109 + }, + { + "epoch": 0.4, + "grad_norm": 1.7340507507324219, + "learning_rate": 6.843709257268112e-06, + "loss": 0.6271, + "step": 3110 + }, + { + "epoch": 0.4, + "grad_norm": 1.5906254053115845, + "learning_rate": 6.841780293791218e-06, + "loss": 0.5912, + "step": 3111 + }, + { + "epoch": 0.4, + "grad_norm": 1.439652681350708, + "learning_rate": 6.839851013105103e-06, + "loss": 0.6353, + "step": 3112 + }, + { + "epoch": 0.4, + "grad_norm": 1.2272002696990967, + "learning_rate": 6.837921415542048e-06, + "loss": 0.6631, + "step": 3113 + }, + { + "epoch": 0.4, + "grad_norm": 1.2197409868240356, + "learning_rate": 6.835991501434387e-06, + "loss": 0.6065, + "step": 3114 + }, + { + "epoch": 0.4, + "grad_norm": 1.2662650346755981, + "learning_rate": 6.834061271114507e-06, + "loss": 0.6074, + "step": 3115 + }, + { + "epoch": 0.4, + "grad_norm": 1.422365665435791, + "learning_rate": 6.832130724914852e-06, + "loss": 0.676, + "step": 3116 + }, + { + "epoch": 0.4, + "grad_norm": 1.490148663520813, + "learning_rate": 6.830199863167919e-06, + "loss": 0.5937, + "step": 3117 + }, + { + "epoch": 0.4, + "grad_norm": 1.249938726425171, + "learning_rate": 6.828268686206259e-06, + "loss": 0.6307, + "step": 3118 + }, + { + "epoch": 0.4, + "grad_norm": 1.534359335899353, + "learning_rate": 6.82633719436248e-06, + "loss": 0.6081, + "step": 3119 + }, + { + "epoch": 0.4, + "grad_norm": 1.2044472694396973, + "learning_rate": 6.82440538796924e-06, + "loss": 0.5735, + "step": 3120 + }, + { + "epoch": 0.4, + "grad_norm": 1.0795295238494873, + "learning_rate": 6.8224732673592555e-06, + "loss": 0.5773, + "step": 3121 + }, + { + "epoch": 0.4, + "grad_norm": 1.2564455270767212, + "learning_rate": 6.820540832865293e-06, + "loss": 0.592, + "step": 3122 + }, + { + "epoch": 0.4, + "grad_norm": 1.1027977466583252, + "learning_rate": 6.818608084820176e-06, + "loss": 0.6072, + "step": 3123 + }, + { + "epoch": 0.4, + "grad_norm": 1.121226191520691, + "learning_rate": 6.816675023556781e-06, + "loss": 0.6798, + "step": 3124 + }, + { + "epoch": 0.4, + "grad_norm": 1.1786526441574097, + "learning_rate": 6.814741649408039e-06, + "loss": 0.6565, + "step": 3125 + }, + { + "epoch": 0.4, + "grad_norm": 1.2946691513061523, + "learning_rate": 6.812807962706933e-06, + "loss": 0.5838, + "step": 3126 + }, + { + "epoch": 0.4, + "grad_norm": 1.294685959815979, + "learning_rate": 6.810873963786501e-06, + "loss": 0.5232, + "step": 3127 + }, + { + "epoch": 0.4, + "grad_norm": 2.5822598934173584, + "learning_rate": 6.808939652979839e-06, + "loss": 0.6297, + "step": 3128 + }, + { + "epoch": 0.4, + "grad_norm": 1.2533141374588013, + "learning_rate": 6.807005030620088e-06, + "loss": 0.6384, + "step": 3129 + }, + { + "epoch": 0.4, + "grad_norm": 1.3660823106765747, + "learning_rate": 6.805070097040451e-06, + "loss": 0.6749, + "step": 3130 + }, + { + "epoch": 0.4, + "grad_norm": 1.3122096061706543, + "learning_rate": 6.803134852574177e-06, + "loss": 0.6865, + "step": 3131 + }, + { + "epoch": 0.4, + "grad_norm": 1.1023629903793335, + "learning_rate": 6.801199297554577e-06, + "loss": 0.6321, + "step": 3132 + }, + { + "epoch": 0.4, + "grad_norm": 1.2410534620285034, + "learning_rate": 6.799263432315006e-06, + "loss": 0.6166, + "step": 3133 + }, + { + "epoch": 0.4, + "grad_norm": 1.456033706665039, + "learning_rate": 6.797327257188882e-06, + "loss": 0.6063, + "step": 3134 + }, + { + "epoch": 0.4, + "grad_norm": 1.1916273832321167, + "learning_rate": 6.79539077250967e-06, + "loss": 0.6006, + "step": 3135 + }, + { + "epoch": 0.4, + "grad_norm": 1.2367602586746216, + "learning_rate": 6.793453978610889e-06, + "loss": 0.6278, + "step": 3136 + }, + { + "epoch": 0.4, + "grad_norm": 1.3319041728973389, + "learning_rate": 6.791516875826115e-06, + "loss": 0.6671, + "step": 3137 + }, + { + "epoch": 0.4, + "grad_norm": 1.1352344751358032, + "learning_rate": 6.789579464488971e-06, + "loss": 0.6093, + "step": 3138 + }, + { + "epoch": 0.4, + "grad_norm": 1.1920417547225952, + "learning_rate": 6.787641744933141e-06, + "loss": 0.6582, + "step": 3139 + }, + { + "epoch": 0.4, + "grad_norm": 1.1601824760437012, + "learning_rate": 6.785703717492355e-06, + "loss": 0.5819, + "step": 3140 + }, + { + "epoch": 0.4, + "grad_norm": 1.0924068689346313, + "learning_rate": 6.783765382500399e-06, + "loss": 0.6438, + "step": 3141 + }, + { + "epoch": 0.4, + "grad_norm": 1.2986295223236084, + "learning_rate": 6.781826740291112e-06, + "loss": 0.6269, + "step": 3142 + }, + { + "epoch": 0.4, + "grad_norm": 0.9878475666046143, + "learning_rate": 6.7798877911983865e-06, + "loss": 0.5701, + "step": 3143 + }, + { + "epoch": 0.4, + "grad_norm": 1.0973353385925293, + "learning_rate": 6.7779485355561656e-06, + "loss": 0.5951, + "step": 3144 + }, + { + "epoch": 0.4, + "grad_norm": 1.1205912828445435, + "learning_rate": 6.776008973698449e-06, + "loss": 0.6099, + "step": 3145 + }, + { + "epoch": 0.4, + "grad_norm": 1.1063035726547241, + "learning_rate": 6.7740691059592844e-06, + "loss": 0.5666, + "step": 3146 + }, + { + "epoch": 0.4, + "grad_norm": 1.3901326656341553, + "learning_rate": 6.7721289326727765e-06, + "loss": 0.6367, + "step": 3147 + }, + { + "epoch": 0.4, + "grad_norm": 1.5512032508850098, + "learning_rate": 6.77018845417308e-06, + "loss": 0.6069, + "step": 3148 + }, + { + "epoch": 0.4, + "grad_norm": 1.5061674118041992, + "learning_rate": 6.768247670794401e-06, + "loss": 0.6052, + "step": 3149 + }, + { + "epoch": 0.4, + "grad_norm": 1.2669321298599243, + "learning_rate": 6.766306582871004e-06, + "loss": 0.593, + "step": 3150 + }, + { + "epoch": 0.4, + "grad_norm": 1.0490036010742188, + "learning_rate": 6.764365190737197e-06, + "loss": 0.58, + "step": 3151 + }, + { + "epoch": 0.4, + "grad_norm": 1.3043758869171143, + "learning_rate": 6.7624234947273495e-06, + "loss": 0.6879, + "step": 3152 + }, + { + "epoch": 0.4, + "grad_norm": 1.293900728225708, + "learning_rate": 6.7604814951758764e-06, + "loss": 0.6152, + "step": 3153 + }, + { + "epoch": 0.4, + "grad_norm": 2.1668155193328857, + "learning_rate": 6.758539192417251e-06, + "loss": 0.5602, + "step": 3154 + }, + { + "epoch": 0.4, + "grad_norm": 1.1483577489852905, + "learning_rate": 6.7565965867859914e-06, + "loss": 0.6185, + "step": 3155 + }, + { + "epoch": 0.4, + "grad_norm": 1.1590529680252075, + "learning_rate": 6.754653678616676e-06, + "loss": 0.6432, + "step": 3156 + }, + { + "epoch": 0.4, + "grad_norm": 1.140228509902954, + "learning_rate": 6.752710468243927e-06, + "loss": 0.6195, + "step": 3157 + }, + { + "epoch": 0.4, + "grad_norm": 1.4454078674316406, + "learning_rate": 6.7507669560024265e-06, + "loss": 0.6264, + "step": 3158 + }, + { + "epoch": 0.4, + "grad_norm": 1.2619370222091675, + "learning_rate": 6.748823142226902e-06, + "loss": 0.7389, + "step": 3159 + }, + { + "epoch": 0.4, + "grad_norm": 1.2213263511657715, + "learning_rate": 6.746879027252138e-06, + "loss": 0.5855, + "step": 3160 + }, + { + "epoch": 0.4, + "grad_norm": 1.2946794033050537, + "learning_rate": 6.74493461141297e-06, + "loss": 0.5703, + "step": 3161 + }, + { + "epoch": 0.41, + "grad_norm": 1.256300926208496, + "learning_rate": 6.7429898950442794e-06, + "loss": 0.6776, + "step": 3162 + }, + { + "epoch": 0.41, + "grad_norm": 1.374864101409912, + "learning_rate": 6.741044878481009e-06, + "loss": 0.6003, + "step": 3163 + }, + { + "epoch": 0.41, + "grad_norm": 1.5750633478164673, + "learning_rate": 6.739099562058146e-06, + "loss": 0.7092, + "step": 3164 + }, + { + "epoch": 0.41, + "grad_norm": 2.048159122467041, + "learning_rate": 6.737153946110732e-06, + "loss": 0.6243, + "step": 3165 + }, + { + "epoch": 0.41, + "grad_norm": 2.874873399734497, + "learning_rate": 6.735208030973858e-06, + "loss": 0.5676, + "step": 3166 + }, + { + "epoch": 0.41, + "grad_norm": 1.1411672830581665, + "learning_rate": 6.7332618169826725e-06, + "loss": 0.5944, + "step": 3167 + }, + { + "epoch": 0.41, + "grad_norm": 1.4558788537979126, + "learning_rate": 6.731315304472366e-06, + "loss": 0.5991, + "step": 3168 + }, + { + "epoch": 0.41, + "grad_norm": 1.4933147430419922, + "learning_rate": 6.7293684937781915e-06, + "loss": 0.6637, + "step": 3169 + }, + { + "epoch": 0.41, + "grad_norm": 1.090269923210144, + "learning_rate": 6.727421385235443e-06, + "loss": 0.597, + "step": 3170 + }, + { + "epoch": 0.41, + "grad_norm": 1.5020686388015747, + "learning_rate": 6.7254739791794735e-06, + "loss": 0.6184, + "step": 3171 + }, + { + "epoch": 0.41, + "grad_norm": 1.2008216381072998, + "learning_rate": 6.7235262759456824e-06, + "loss": 0.5001, + "step": 3172 + }, + { + "epoch": 0.41, + "grad_norm": 1.278684377670288, + "learning_rate": 6.721578275869521e-06, + "loss": 0.6604, + "step": 3173 + }, + { + "epoch": 0.41, + "grad_norm": 1.2600221633911133, + "learning_rate": 6.719629979286495e-06, + "loss": 0.6642, + "step": 3174 + }, + { + "epoch": 0.41, + "grad_norm": 1.3807469606399536, + "learning_rate": 6.717681386532158e-06, + "loss": 0.5636, + "step": 3175 + }, + { + "epoch": 0.41, + "grad_norm": 1.2727843523025513, + "learning_rate": 6.7157324979421145e-06, + "loss": 0.6672, + "step": 3176 + }, + { + "epoch": 0.41, + "grad_norm": 1.1497278213500977, + "learning_rate": 6.71378331385202e-06, + "loss": 0.7587, + "step": 3177 + }, + { + "epoch": 0.41, + "grad_norm": 1.1401848793029785, + "learning_rate": 6.711833834597587e-06, + "loss": 0.5935, + "step": 3178 + }, + { + "epoch": 0.41, + "grad_norm": 1.0336487293243408, + "learning_rate": 6.709884060514568e-06, + "loss": 0.5936, + "step": 3179 + }, + { + "epoch": 0.41, + "grad_norm": 1.3813772201538086, + "learning_rate": 6.707933991938777e-06, + "loss": 0.6915, + "step": 3180 + }, + { + "epoch": 0.41, + "grad_norm": 1.2658019065856934, + "learning_rate": 6.705983629206068e-06, + "loss": 0.5932, + "step": 3181 + }, + { + "epoch": 0.41, + "grad_norm": 1.535305142402649, + "learning_rate": 6.704032972652357e-06, + "loss": 0.6805, + "step": 3182 + }, + { + "epoch": 0.41, + "grad_norm": 1.2994247674942017, + "learning_rate": 6.7020820226136e-06, + "loss": 0.6971, + "step": 3183 + }, + { + "epoch": 0.41, + "grad_norm": 1.518620252609253, + "learning_rate": 6.700130779425812e-06, + "loss": 0.6603, + "step": 3184 + }, + { + "epoch": 0.41, + "grad_norm": 1.4807312488555908, + "learning_rate": 6.698179243425053e-06, + "loss": 0.5437, + "step": 3185 + }, + { + "epoch": 0.41, + "grad_norm": 1.1867648363113403, + "learning_rate": 6.696227414947436e-06, + "loss": 0.7545, + "step": 3186 + }, + { + "epoch": 0.41, + "grad_norm": 1.30292809009552, + "learning_rate": 6.694275294329125e-06, + "loss": 0.6676, + "step": 3187 + }, + { + "epoch": 0.41, + "grad_norm": 1.5975512266159058, + "learning_rate": 6.69232288190633e-06, + "loss": 0.6476, + "step": 3188 + }, + { + "epoch": 0.41, + "grad_norm": 1.0806468725204468, + "learning_rate": 6.690370178015318e-06, + "loss": 0.6023, + "step": 3189 + }, + { + "epoch": 0.41, + "grad_norm": 1.43095862865448, + "learning_rate": 6.688417182992399e-06, + "loss": 0.6321, + "step": 3190 + }, + { + "epoch": 0.41, + "grad_norm": 1.135010838508606, + "learning_rate": 6.686463897173942e-06, + "loss": 0.6086, + "step": 3191 + }, + { + "epoch": 0.41, + "grad_norm": 1.3971952199935913, + "learning_rate": 6.684510320896354e-06, + "loss": 0.5269, + "step": 3192 + }, + { + "epoch": 0.41, + "grad_norm": 1.2091296911239624, + "learning_rate": 6.6825564544961055e-06, + "loss": 0.6593, + "step": 3193 + }, + { + "epoch": 0.41, + "grad_norm": 1.231992483139038, + "learning_rate": 6.680602298309703e-06, + "loss": 0.6894, + "step": 3194 + }, + { + "epoch": 0.41, + "grad_norm": 1.6504241228103638, + "learning_rate": 6.678647852673717e-06, + "loss": 0.6522, + "step": 3195 + }, + { + "epoch": 0.41, + "grad_norm": 1.6003928184509277, + "learning_rate": 6.676693117924757e-06, + "loss": 0.5055, + "step": 3196 + }, + { + "epoch": 0.41, + "grad_norm": 1.3327730894088745, + "learning_rate": 6.674738094399488e-06, + "loss": 0.6239, + "step": 3197 + }, + { + "epoch": 0.41, + "grad_norm": 1.1069480180740356, + "learning_rate": 6.672782782434622e-06, + "loss": 0.5595, + "step": 3198 + }, + { + "epoch": 0.41, + "grad_norm": 1.0996955633163452, + "learning_rate": 6.670827182366922e-06, + "loss": 0.6455, + "step": 3199 + }, + { + "epoch": 0.41, + "grad_norm": 1.1967326402664185, + "learning_rate": 6.668871294533202e-06, + "loss": 0.6504, + "step": 3200 + }, + { + "epoch": 0.41, + "grad_norm": 1.4416142702102661, + "learning_rate": 6.666915119270322e-06, + "loss": 0.7059, + "step": 3201 + }, + { + "epoch": 0.41, + "grad_norm": 1.0445725917816162, + "learning_rate": 6.664958656915195e-06, + "loss": 0.5707, + "step": 3202 + }, + { + "epoch": 0.41, + "grad_norm": 1.2745569944381714, + "learning_rate": 6.663001907804778e-06, + "loss": 0.6299, + "step": 3203 + }, + { + "epoch": 0.41, + "grad_norm": 1.199661374092102, + "learning_rate": 6.661044872276086e-06, + "loss": 0.6521, + "step": 3204 + }, + { + "epoch": 0.41, + "grad_norm": 1.2170096635818481, + "learning_rate": 6.6590875506661764e-06, + "loss": 0.5463, + "step": 3205 + }, + { + "epoch": 0.41, + "grad_norm": 1.6790763139724731, + "learning_rate": 6.65712994331216e-06, + "loss": 0.6315, + "step": 3206 + }, + { + "epoch": 0.41, + "grad_norm": 1.6332316398620605, + "learning_rate": 6.655172050551191e-06, + "loss": 0.4971, + "step": 3207 + }, + { + "epoch": 0.41, + "grad_norm": 1.081221580505371, + "learning_rate": 6.653213872720481e-06, + "loss": 0.7278, + "step": 3208 + }, + { + "epoch": 0.41, + "grad_norm": 1.130863904953003, + "learning_rate": 6.651255410157282e-06, + "loss": 0.6655, + "step": 3209 + }, + { + "epoch": 0.41, + "grad_norm": 1.1781548261642456, + "learning_rate": 6.649296663198903e-06, + "loss": 0.6444, + "step": 3210 + }, + { + "epoch": 0.41, + "grad_norm": 1.1700100898742676, + "learning_rate": 6.6473376321826965e-06, + "loss": 0.6042, + "step": 3211 + }, + { + "epoch": 0.41, + "grad_norm": 1.1747479438781738, + "learning_rate": 6.645378317446066e-06, + "loss": 0.6241, + "step": 3212 + }, + { + "epoch": 0.41, + "grad_norm": 1.5794117450714111, + "learning_rate": 6.643418719326466e-06, + "loss": 0.6243, + "step": 3213 + }, + { + "epoch": 0.41, + "grad_norm": 1.0308579206466675, + "learning_rate": 6.6414588381613935e-06, + "loss": 0.6249, + "step": 3214 + }, + { + "epoch": 0.41, + "grad_norm": 1.6672332286834717, + "learning_rate": 6.6394986742884e-06, + "loss": 0.5955, + "step": 3215 + }, + { + "epoch": 0.41, + "grad_norm": 1.2605336904525757, + "learning_rate": 6.637538228045084e-06, + "loss": 0.6468, + "step": 3216 + }, + { + "epoch": 0.41, + "grad_norm": 1.1389272212982178, + "learning_rate": 6.635577499769093e-06, + "loss": 0.5899, + "step": 3217 + }, + { + "epoch": 0.41, + "grad_norm": 1.2226412296295166, + "learning_rate": 6.633616489798121e-06, + "loss": 0.5962, + "step": 3218 + }, + { + "epoch": 0.41, + "grad_norm": 1.1442811489105225, + "learning_rate": 6.631655198469915e-06, + "loss": 0.7649, + "step": 3219 + }, + { + "epoch": 0.41, + "grad_norm": 1.2384653091430664, + "learning_rate": 6.629693626122262e-06, + "loss": 0.5698, + "step": 3220 + }, + { + "epoch": 0.41, + "grad_norm": 1.148032546043396, + "learning_rate": 6.62773177309301e-06, + "loss": 0.6537, + "step": 3221 + }, + { + "epoch": 0.41, + "grad_norm": 1.6926991939544678, + "learning_rate": 6.625769639720045e-06, + "loss": 0.5875, + "step": 3222 + }, + { + "epoch": 0.41, + "grad_norm": 1.2067431211471558, + "learning_rate": 6.623807226341303e-06, + "loss": 0.6435, + "step": 3223 + }, + { + "epoch": 0.41, + "grad_norm": 1.403552532196045, + "learning_rate": 6.621844533294772e-06, + "loss": 0.6821, + "step": 3224 + }, + { + "epoch": 0.41, + "grad_norm": 1.3150187730789185, + "learning_rate": 6.619881560918485e-06, + "loss": 0.5826, + "step": 3225 + }, + { + "epoch": 0.41, + "grad_norm": 2.287637948989868, + "learning_rate": 6.6179183095505265e-06, + "loss": 0.6228, + "step": 3226 + }, + { + "epoch": 0.41, + "grad_norm": 1.7602990865707397, + "learning_rate": 6.6159547795290214e-06, + "loss": 0.5653, + "step": 3227 + }, + { + "epoch": 0.41, + "grad_norm": 1.151167869567871, + "learning_rate": 6.613990971192152e-06, + "loss": 0.5097, + "step": 3228 + }, + { + "epoch": 0.41, + "grad_norm": 1.2814674377441406, + "learning_rate": 6.6120268848781445e-06, + "loss": 0.6207, + "step": 3229 + }, + { + "epoch": 0.41, + "grad_norm": 1.2942862510681152, + "learning_rate": 6.610062520925271e-06, + "loss": 0.6376, + "step": 3230 + }, + { + "epoch": 0.41, + "grad_norm": 2.0902457237243652, + "learning_rate": 6.608097879671853e-06, + "loss": 0.6226, + "step": 3231 + }, + { + "epoch": 0.41, + "grad_norm": 1.3513823747634888, + "learning_rate": 6.606132961456264e-06, + "loss": 0.6445, + "step": 3232 + }, + { + "epoch": 0.41, + "grad_norm": 1.0098199844360352, + "learning_rate": 6.604167766616916e-06, + "loss": 0.564, + "step": 3233 + }, + { + "epoch": 0.41, + "grad_norm": 1.3201072216033936, + "learning_rate": 6.602202295492277e-06, + "loss": 0.5242, + "step": 3234 + }, + { + "epoch": 0.41, + "grad_norm": 1.1454012393951416, + "learning_rate": 6.600236548420858e-06, + "loss": 0.6332, + "step": 3235 + }, + { + "epoch": 0.41, + "grad_norm": 1.72659170627594, + "learning_rate": 6.59827052574122e-06, + "loss": 0.6348, + "step": 3236 + }, + { + "epoch": 0.41, + "grad_norm": 1.327935814857483, + "learning_rate": 6.59630422779197e-06, + "loss": 0.5882, + "step": 3237 + }, + { + "epoch": 0.41, + "grad_norm": 1.219178557395935, + "learning_rate": 6.594337654911761e-06, + "loss": 0.72, + "step": 3238 + }, + { + "epoch": 0.41, + "grad_norm": 1.1300914287567139, + "learning_rate": 6.592370807439299e-06, + "loss": 0.6251, + "step": 3239 + }, + { + "epoch": 0.42, + "grad_norm": 1.0695475339889526, + "learning_rate": 6.5904036857133315e-06, + "loss": 0.5239, + "step": 3240 + }, + { + "epoch": 0.42, + "grad_norm": 1.289373755455017, + "learning_rate": 6.588436290072655e-06, + "loss": 0.5697, + "step": 3241 + }, + { + "epoch": 0.42, + "grad_norm": 1.647684931755066, + "learning_rate": 6.586468620856114e-06, + "loss": 0.6084, + "step": 3242 + }, + { + "epoch": 0.42, + "grad_norm": 1.1704531908035278, + "learning_rate": 6.5845006784025985e-06, + "loss": 0.7122, + "step": 3243 + }, + { + "epoch": 0.42, + "grad_norm": 1.6329240798950195, + "learning_rate": 6.582532463051048e-06, + "loss": 0.6969, + "step": 3244 + }, + { + "epoch": 0.42, + "grad_norm": 0.9630757570266724, + "learning_rate": 6.580563975140447e-06, + "loss": 0.6295, + "step": 3245 + }, + { + "epoch": 0.42, + "grad_norm": 1.0210407972335815, + "learning_rate": 6.578595215009827e-06, + "loss": 0.6567, + "step": 3246 + }, + { + "epoch": 0.42, + "grad_norm": 1.281633734703064, + "learning_rate": 6.576626182998267e-06, + "loss": 0.5795, + "step": 3247 + }, + { + "epoch": 0.42, + "grad_norm": 1.1632440090179443, + "learning_rate": 6.574656879444894e-06, + "loss": 0.592, + "step": 3248 + }, + { + "epoch": 0.42, + "grad_norm": 1.1041927337646484, + "learning_rate": 6.5726873046888795e-06, + "loss": 0.6129, + "step": 3249 + }, + { + "epoch": 0.42, + "grad_norm": 1.37405526638031, + "learning_rate": 6.570717459069442e-06, + "loss": 0.7053, + "step": 3250 + }, + { + "epoch": 0.42, + "grad_norm": 1.7148224115371704, + "learning_rate": 6.56874734292585e-06, + "loss": 0.6797, + "step": 3251 + }, + { + "epoch": 0.42, + "grad_norm": 1.98465096950531, + "learning_rate": 6.5667769565974126e-06, + "loss": 0.643, + "step": 3252 + }, + { + "epoch": 0.42, + "grad_norm": 1.3943524360656738, + "learning_rate": 6.56480630042349e-06, + "loss": 0.6549, + "step": 3253 + }, + { + "epoch": 0.42, + "grad_norm": 1.2270232439041138, + "learning_rate": 6.562835374743488e-06, + "loss": 0.6189, + "step": 3254 + }, + { + "epoch": 0.42, + "grad_norm": 1.2784587144851685, + "learning_rate": 6.5608641798968596e-06, + "loss": 0.6567, + "step": 3255 + }, + { + "epoch": 0.42, + "grad_norm": 1.17548406124115, + "learning_rate": 6.558892716223102e-06, + "loss": 0.7443, + "step": 3256 + }, + { + "epoch": 0.42, + "grad_norm": 1.4248778820037842, + "learning_rate": 6.556920984061759e-06, + "loss": 0.6288, + "step": 3257 + }, + { + "epoch": 0.42, + "grad_norm": 4.57545804977417, + "learning_rate": 6.554948983752423e-06, + "loss": 0.7004, + "step": 3258 + }, + { + "epoch": 0.42, + "grad_norm": 1.1954271793365479, + "learning_rate": 6.552976715634729e-06, + "loss": 0.5588, + "step": 3259 + }, + { + "epoch": 0.42, + "grad_norm": 1.479419231414795, + "learning_rate": 6.551004180048361e-06, + "loss": 0.6446, + "step": 3260 + }, + { + "epoch": 0.42, + "grad_norm": 1.04899001121521, + "learning_rate": 6.549031377333049e-06, + "loss": 0.5631, + "step": 3261 + }, + { + "epoch": 0.42, + "grad_norm": 1.1985399723052979, + "learning_rate": 6.5470583078285685e-06, + "loss": 0.5939, + "step": 3262 + }, + { + "epoch": 0.42, + "grad_norm": 1.081529974937439, + "learning_rate": 6.545084971874738e-06, + "loss": 0.6133, + "step": 3263 + }, + { + "epoch": 0.42, + "grad_norm": 1.3592029809951782, + "learning_rate": 6.5431113698114255e-06, + "loss": 0.6758, + "step": 3264 + }, + { + "epoch": 0.42, + "grad_norm": 1.38568115234375, + "learning_rate": 6.541137501978547e-06, + "loss": 0.6327, + "step": 3265 + }, + { + "epoch": 0.42, + "grad_norm": 1.2125675678253174, + "learning_rate": 6.539163368716057e-06, + "loss": 0.5552, + "step": 3266 + }, + { + "epoch": 0.42, + "grad_norm": 1.0356985330581665, + "learning_rate": 6.537188970363961e-06, + "loss": 0.5857, + "step": 3267 + }, + { + "epoch": 0.42, + "grad_norm": 1.4390578269958496, + "learning_rate": 6.53521430726231e-06, + "loss": 0.6635, + "step": 3268 + }, + { + "epoch": 0.42, + "grad_norm": 1.1902811527252197, + "learning_rate": 6.5332393797512e-06, + "loss": 0.73, + "step": 3269 + }, + { + "epoch": 0.42, + "grad_norm": 1.1815040111541748, + "learning_rate": 6.531264188170769e-06, + "loss": 0.678, + "step": 3270 + }, + { + "epoch": 0.42, + "grad_norm": 1.6778348684310913, + "learning_rate": 6.529288732861207e-06, + "loss": 0.6705, + "step": 3271 + }, + { + "epoch": 0.42, + "grad_norm": 1.5308274030685425, + "learning_rate": 6.527313014162745e-06, + "loss": 0.6068, + "step": 3272 + }, + { + "epoch": 0.42, + "grad_norm": 1.2709729671478271, + "learning_rate": 6.525337032415658e-06, + "loss": 0.5982, + "step": 3273 + }, + { + "epoch": 0.42, + "grad_norm": 1.3264434337615967, + "learning_rate": 6.523360787960273e-06, + "loss": 0.599, + "step": 3274 + }, + { + "epoch": 0.42, + "grad_norm": 2.82342791557312, + "learning_rate": 6.521384281136955e-06, + "loss": 0.6239, + "step": 3275 + }, + { + "epoch": 0.42, + "grad_norm": 1.3357826471328735, + "learning_rate": 6.519407512286119e-06, + "loss": 0.5771, + "step": 3276 + }, + { + "epoch": 0.42, + "grad_norm": 1.4777655601501465, + "learning_rate": 6.51743048174822e-06, + "loss": 0.6048, + "step": 3277 + }, + { + "epoch": 0.42, + "grad_norm": 1.4173051118850708, + "learning_rate": 6.515453189863765e-06, + "loss": 0.5204, + "step": 3278 + }, + { + "epoch": 0.42, + "grad_norm": 1.2135194540023804, + "learning_rate": 6.513475636973301e-06, + "loss": 0.5671, + "step": 3279 + }, + { + "epoch": 0.42, + "grad_norm": 1.3434330224990845, + "learning_rate": 6.511497823417418e-06, + "loss": 0.576, + "step": 3280 + }, + { + "epoch": 0.42, + "grad_norm": 1.3612425327301025, + "learning_rate": 6.50951974953676e-06, + "loss": 0.6269, + "step": 3281 + }, + { + "epoch": 0.42, + "grad_norm": 1.213051438331604, + "learning_rate": 6.507541415672007e-06, + "loss": 0.6763, + "step": 3282 + }, + { + "epoch": 0.42, + "grad_norm": 1.151252031326294, + "learning_rate": 6.505562822163887e-06, + "loss": 0.556, + "step": 3283 + }, + { + "epoch": 0.42, + "grad_norm": 2.0518524646759033, + "learning_rate": 6.503583969353173e-06, + "loss": 0.7004, + "step": 3284 + }, + { + "epoch": 0.42, + "grad_norm": 1.3543423414230347, + "learning_rate": 6.501604857580681e-06, + "loss": 0.6353, + "step": 3285 + }, + { + "epoch": 0.42, + "grad_norm": 1.3768010139465332, + "learning_rate": 6.499625487187276e-06, + "loss": 0.5926, + "step": 3286 + }, + { + "epoch": 0.42, + "grad_norm": 1.7199901342391968, + "learning_rate": 6.497645858513858e-06, + "loss": 0.631, + "step": 3287 + }, + { + "epoch": 0.42, + "grad_norm": 1.2099894285202026, + "learning_rate": 6.4956659719013835e-06, + "loss": 0.5909, + "step": 3288 + }, + { + "epoch": 0.42, + "grad_norm": 1.031480312347412, + "learning_rate": 6.493685827690846e-06, + "loss": 0.6396, + "step": 3289 + }, + { + "epoch": 0.42, + "grad_norm": 1.2236201763153076, + "learning_rate": 6.491705426223285e-06, + "loss": 0.6766, + "step": 3290 + }, + { + "epoch": 0.42, + "grad_norm": 1.6073354482650757, + "learning_rate": 6.4897247678397845e-06, + "loss": 0.6743, + "step": 3291 + }, + { + "epoch": 0.42, + "grad_norm": 1.1248873472213745, + "learning_rate": 6.487743852881472e-06, + "loss": 0.5832, + "step": 3292 + }, + { + "epoch": 0.42, + "grad_norm": 1.0797927379608154, + "learning_rate": 6.485762681689521e-06, + "loss": 0.6391, + "step": 3293 + }, + { + "epoch": 0.42, + "grad_norm": 1.4567921161651611, + "learning_rate": 6.483781254605146e-06, + "loss": 0.6303, + "step": 3294 + }, + { + "epoch": 0.42, + "grad_norm": 1.9019827842712402, + "learning_rate": 6.48179957196961e-06, + "loss": 0.5922, + "step": 3295 + }, + { + "epoch": 0.42, + "grad_norm": 1.6069629192352295, + "learning_rate": 6.479817634124216e-06, + "loss": 0.6879, + "step": 3296 + }, + { + "epoch": 0.42, + "grad_norm": 1.3667546510696411, + "learning_rate": 6.477835441410311e-06, + "loss": 0.65, + "step": 3297 + }, + { + "epoch": 0.42, + "grad_norm": 1.415429711341858, + "learning_rate": 6.475852994169294e-06, + "loss": 0.6479, + "step": 3298 + }, + { + "epoch": 0.42, + "grad_norm": 1.2002050876617432, + "learning_rate": 6.473870292742592e-06, + "loss": 0.5623, + "step": 3299 + }, + { + "epoch": 0.42, + "grad_norm": 1.5991908311843872, + "learning_rate": 6.471887337471693e-06, + "loss": 0.6927, + "step": 3300 + }, + { + "epoch": 0.42, + "grad_norm": 1.442886471748352, + "learning_rate": 6.4699041286981155e-06, + "loss": 0.6233, + "step": 3301 + }, + { + "epoch": 0.42, + "grad_norm": 1.619641661643982, + "learning_rate": 6.46792066676343e-06, + "loss": 0.6882, + "step": 3302 + }, + { + "epoch": 0.42, + "grad_norm": 1.233254313468933, + "learning_rate": 6.465936952009245e-06, + "loss": 0.6169, + "step": 3303 + }, + { + "epoch": 0.42, + "grad_norm": 1.1515586376190186, + "learning_rate": 6.463952984777218e-06, + "loss": 0.5435, + "step": 3304 + }, + { + "epoch": 0.42, + "grad_norm": 1.2402045726776123, + "learning_rate": 6.461968765409041e-06, + "loss": 0.7376, + "step": 3305 + }, + { + "epoch": 0.42, + "grad_norm": 1.3234292268753052, + "learning_rate": 6.4599842942464645e-06, + "loss": 0.7769, + "step": 3306 + }, + { + "epoch": 0.42, + "grad_norm": 1.9401755332946777, + "learning_rate": 6.457999571631266e-06, + "loss": 0.6768, + "step": 3307 + }, + { + "epoch": 0.42, + "grad_norm": 2.493912696838379, + "learning_rate": 6.456014597905278e-06, + "loss": 0.5967, + "step": 3308 + }, + { + "epoch": 0.42, + "grad_norm": 1.2056974172592163, + "learning_rate": 6.454029373410369e-06, + "loss": 0.5962, + "step": 3309 + }, + { + "epoch": 0.42, + "grad_norm": 1.219796061515808, + "learning_rate": 6.452043898488456e-06, + "loss": 0.6247, + "step": 3310 + }, + { + "epoch": 0.42, + "grad_norm": 1.3913002014160156, + "learning_rate": 6.450058173481493e-06, + "loss": 0.5459, + "step": 3311 + }, + { + "epoch": 0.42, + "grad_norm": 1.591672658920288, + "learning_rate": 6.448072198731485e-06, + "loss": 0.5614, + "step": 3312 + }, + { + "epoch": 0.42, + "grad_norm": 1.4829857349395752, + "learning_rate": 6.4460859745804735e-06, + "loss": 0.6168, + "step": 3313 + }, + { + "epoch": 0.42, + "grad_norm": 1.0522713661193848, + "learning_rate": 6.444099501370545e-06, + "loss": 0.5752, + "step": 3314 + }, + { + "epoch": 0.42, + "grad_norm": 1.390140175819397, + "learning_rate": 6.442112779443832e-06, + "loss": 0.6122, + "step": 3315 + }, + { + "epoch": 0.42, + "grad_norm": 1.25529944896698, + "learning_rate": 6.440125809142503e-06, + "loss": 0.5556, + "step": 3316 + }, + { + "epoch": 0.42, + "grad_norm": 1.3403065204620361, + "learning_rate": 6.438138590808776e-06, + "loss": 0.6308, + "step": 3317 + }, + { + "epoch": 0.43, + "grad_norm": 1.25216543674469, + "learning_rate": 6.436151124784906e-06, + "loss": 0.6417, + "step": 3318 + }, + { + "epoch": 0.43, + "grad_norm": 1.2277629375457764, + "learning_rate": 6.434163411413197e-06, + "loss": 0.6425, + "step": 3319 + }, + { + "epoch": 0.43, + "grad_norm": 1.0705773830413818, + "learning_rate": 6.432175451035991e-06, + "loss": 0.5152, + "step": 3320 + }, + { + "epoch": 0.43, + "grad_norm": 1.466159701347351, + "learning_rate": 6.430187243995674e-06, + "loss": 0.6207, + "step": 3321 + }, + { + "epoch": 0.43, + "grad_norm": 1.1634601354599, + "learning_rate": 6.428198790634672e-06, + "loss": 0.5681, + "step": 3322 + }, + { + "epoch": 0.43, + "grad_norm": 1.6939680576324463, + "learning_rate": 6.4262100912954585e-06, + "loss": 0.593, + "step": 3323 + }, + { + "epoch": 0.43, + "grad_norm": 1.3650009632110596, + "learning_rate": 6.424221146320547e-06, + "loss": 0.6422, + "step": 3324 + }, + { + "epoch": 0.43, + "grad_norm": 1.1898249387741089, + "learning_rate": 6.422231956052489e-06, + "loss": 0.6359, + "step": 3325 + }, + { + "epoch": 0.43, + "grad_norm": 1.2429956197738647, + "learning_rate": 6.420242520833886e-06, + "loss": 0.6454, + "step": 3326 + }, + { + "epoch": 0.43, + "grad_norm": 1.1976206302642822, + "learning_rate": 6.418252841007376e-06, + "loss": 0.5683, + "step": 3327 + }, + { + "epoch": 0.43, + "grad_norm": 1.0774952173233032, + "learning_rate": 6.416262916915642e-06, + "loss": 0.6135, + "step": 3328 + }, + { + "epoch": 0.43, + "grad_norm": 1.288520336151123, + "learning_rate": 6.414272748901405e-06, + "loss": 0.6605, + "step": 3329 + }, + { + "epoch": 0.43, + "grad_norm": 2.0824525356292725, + "learning_rate": 6.412282337307436e-06, + "loss": 0.5897, + "step": 3330 + }, + { + "epoch": 0.43, + "grad_norm": 1.8679497241973877, + "learning_rate": 6.410291682476537e-06, + "loss": 0.5329, + "step": 3331 + }, + { + "epoch": 0.43, + "grad_norm": 1.217800498008728, + "learning_rate": 6.4083007847515634e-06, + "loss": 0.6684, + "step": 3332 + }, + { + "epoch": 0.43, + "grad_norm": 1.3628228902816772, + "learning_rate": 6.406309644475404e-06, + "loss": 0.6208, + "step": 3333 + }, + { + "epoch": 0.43, + "grad_norm": 1.4911750555038452, + "learning_rate": 6.404318261990992e-06, + "loss": 0.6059, + "step": 3334 + }, + { + "epoch": 0.43, + "grad_norm": 1.5707592964172363, + "learning_rate": 6.402326637641303e-06, + "loss": 0.5948, + "step": 3335 + }, + { + "epoch": 0.43, + "grad_norm": 1.3157973289489746, + "learning_rate": 6.400334771769355e-06, + "loss": 0.6137, + "step": 3336 + }, + { + "epoch": 0.43, + "grad_norm": 1.2203538417816162, + "learning_rate": 6.398342664718204e-06, + "loss": 0.607, + "step": 3337 + }, + { + "epoch": 0.43, + "grad_norm": 1.2806761264801025, + "learning_rate": 6.396350316830954e-06, + "loss": 0.5865, + "step": 3338 + }, + { + "epoch": 0.43, + "grad_norm": 1.370572805404663, + "learning_rate": 6.394357728450741e-06, + "loss": 0.6144, + "step": 3339 + }, + { + "epoch": 0.43, + "grad_norm": 1.2665327787399292, + "learning_rate": 6.392364899920751e-06, + "loss": 0.5818, + "step": 3340 + }, + { + "epoch": 0.43, + "grad_norm": 1.244334101676941, + "learning_rate": 6.39037183158421e-06, + "loss": 0.6458, + "step": 3341 + }, + { + "epoch": 0.43, + "grad_norm": 1.2308834791183472, + "learning_rate": 6.388378523784379e-06, + "loss": 0.7208, + "step": 3342 + }, + { + "epoch": 0.43, + "grad_norm": 1.2577228546142578, + "learning_rate": 6.386384976864569e-06, + "loss": 0.7069, + "step": 3343 + }, + { + "epoch": 0.43, + "grad_norm": 1.1200151443481445, + "learning_rate": 6.384391191168124e-06, + "loss": 0.6288, + "step": 3344 + }, + { + "epoch": 0.43, + "grad_norm": 1.3991978168487549, + "learning_rate": 6.382397167038438e-06, + "loss": 0.616, + "step": 3345 + }, + { + "epoch": 0.43, + "grad_norm": 1.689049482345581, + "learning_rate": 6.3804029048189355e-06, + "loss": 0.6469, + "step": 3346 + }, + { + "epoch": 0.43, + "grad_norm": 1.1596959829330444, + "learning_rate": 6.378408404853093e-06, + "loss": 0.538, + "step": 3347 + }, + { + "epoch": 0.43, + "grad_norm": 1.1679819822311401, + "learning_rate": 6.376413667484417e-06, + "loss": 0.7596, + "step": 3348 + }, + { + "epoch": 0.43, + "grad_norm": 1.8966916799545288, + "learning_rate": 6.374418693056464e-06, + "loss": 0.6038, + "step": 3349 + }, + { + "epoch": 0.43, + "grad_norm": 1.218559741973877, + "learning_rate": 6.37242348191283e-06, + "loss": 0.6026, + "step": 3350 + }, + { + "epoch": 0.43, + "grad_norm": 1.2759473323822021, + "learning_rate": 6.370428034397144e-06, + "loss": 0.5726, + "step": 3351 + }, + { + "epoch": 0.43, + "grad_norm": 1.0636835098266602, + "learning_rate": 6.368432350853085e-06, + "loss": 0.6214, + "step": 3352 + }, + { + "epoch": 0.43, + "grad_norm": 1.555844783782959, + "learning_rate": 6.366436431624368e-06, + "loss": 0.622, + "step": 3353 + }, + { + "epoch": 0.43, + "grad_norm": 1.8688308000564575, + "learning_rate": 6.3644402770547496e-06, + "loss": 0.6356, + "step": 3354 + }, + { + "epoch": 0.43, + "grad_norm": 0.9006859660148621, + "learning_rate": 6.3624438874880256e-06, + "loss": 0.6228, + "step": 3355 + }, + { + "epoch": 0.43, + "grad_norm": 1.367267370223999, + "learning_rate": 6.360447263268037e-06, + "loss": 0.5737, + "step": 3356 + }, + { + "epoch": 0.43, + "grad_norm": 1.1289016008377075, + "learning_rate": 6.358450404738656e-06, + "loss": 0.6597, + "step": 3357 + }, + { + "epoch": 0.43, + "grad_norm": 1.5956635475158691, + "learning_rate": 6.356453312243807e-06, + "loss": 0.5691, + "step": 3358 + }, + { + "epoch": 0.43, + "grad_norm": 1.2158634662628174, + "learning_rate": 6.354455986127445e-06, + "loss": 0.5911, + "step": 3359 + }, + { + "epoch": 0.43, + "grad_norm": 2.0845446586608887, + "learning_rate": 6.352458426733571e-06, + "loss": 0.5655, + "step": 3360 + }, + { + "epoch": 0.43, + "grad_norm": 1.3381253480911255, + "learning_rate": 6.3504606344062215e-06, + "loss": 0.6276, + "step": 3361 + }, + { + "epoch": 0.43, + "grad_norm": 1.382815957069397, + "learning_rate": 6.348462609489477e-06, + "loss": 0.6372, + "step": 3362 + }, + { + "epoch": 0.43, + "grad_norm": 1.607489824295044, + "learning_rate": 6.346464352327456e-06, + "loss": 0.7139, + "step": 3363 + }, + { + "epoch": 0.43, + "grad_norm": 1.351166009902954, + "learning_rate": 6.3444658632643195e-06, + "loss": 0.5938, + "step": 3364 + }, + { + "epoch": 0.43, + "grad_norm": 2.418544292449951, + "learning_rate": 6.342467142644264e-06, + "loss": 0.5654, + "step": 3365 + }, + { + "epoch": 0.43, + "grad_norm": 1.2091163396835327, + "learning_rate": 6.340468190811531e-06, + "loss": 0.6736, + "step": 3366 + }, + { + "epoch": 0.43, + "grad_norm": 1.4336068630218506, + "learning_rate": 6.338469008110399e-06, + "loss": 0.693, + "step": 3367 + }, + { + "epoch": 0.43, + "grad_norm": 1.4184355735778809, + "learning_rate": 6.336469594885183e-06, + "loss": 0.6291, + "step": 3368 + }, + { + "epoch": 0.43, + "grad_norm": 1.5481655597686768, + "learning_rate": 6.3344699514802465e-06, + "loss": 0.6017, + "step": 3369 + }, + { + "epoch": 0.43, + "grad_norm": 1.1255567073822021, + "learning_rate": 6.332470078239983e-06, + "loss": 0.6172, + "step": 3370 + }, + { + "epoch": 0.43, + "grad_norm": 1.1182490587234497, + "learning_rate": 6.330469975508834e-06, + "loss": 0.584, + "step": 3371 + }, + { + "epoch": 0.43, + "grad_norm": 1.1752885580062866, + "learning_rate": 6.328469643631273e-06, + "loss": 0.7249, + "step": 3372 + }, + { + "epoch": 0.43, + "grad_norm": 1.3815187215805054, + "learning_rate": 6.326469082951819e-06, + "loss": 0.635, + "step": 3373 + }, + { + "epoch": 0.43, + "grad_norm": 1.2596144676208496, + "learning_rate": 6.3244682938150255e-06, + "loss": 0.5281, + "step": 3374 + }, + { + "epoch": 0.43, + "grad_norm": 1.3565584421157837, + "learning_rate": 6.3224672765654905e-06, + "loss": 0.6269, + "step": 3375 + }, + { + "epoch": 0.43, + "grad_norm": 1.5856893062591553, + "learning_rate": 6.320466031547847e-06, + "loss": 0.6211, + "step": 3376 + }, + { + "epoch": 0.43, + "grad_norm": 1.1341127157211304, + "learning_rate": 6.31846455910677e-06, + "loss": 0.5976, + "step": 3377 + }, + { + "epoch": 0.43, + "grad_norm": 1.2035582065582275, + "learning_rate": 6.316462859586971e-06, + "loss": 0.5688, + "step": 3378 + }, + { + "epoch": 0.43, + "grad_norm": 1.234596610069275, + "learning_rate": 6.314460933333201e-06, + "loss": 0.613, + "step": 3379 + }, + { + "epoch": 0.43, + "grad_norm": 1.051673412322998, + "learning_rate": 6.312458780690254e-06, + "loss": 0.4786, + "step": 3380 + }, + { + "epoch": 0.43, + "grad_norm": 1.180688500404358, + "learning_rate": 6.310456402002958e-06, + "loss": 0.6154, + "step": 3381 + }, + { + "epoch": 0.43, + "grad_norm": 1.144311547279358, + "learning_rate": 6.308453797616184e-06, + "loss": 0.7581, + "step": 3382 + }, + { + "epoch": 0.43, + "grad_norm": 1.3280012607574463, + "learning_rate": 6.306450967874836e-06, + "loss": 0.6299, + "step": 3383 + }, + { + "epoch": 0.43, + "grad_norm": 1.2186527252197266, + "learning_rate": 6.304447913123866e-06, + "loss": 0.5792, + "step": 3384 + }, + { + "epoch": 0.43, + "grad_norm": 1.3583048582077026, + "learning_rate": 6.3024446337082555e-06, + "loss": 0.6775, + "step": 3385 + }, + { + "epoch": 0.43, + "grad_norm": 1.3401097059249878, + "learning_rate": 6.300441129973032e-06, + "loss": 0.6299, + "step": 3386 + }, + { + "epoch": 0.43, + "grad_norm": 1.4296989440917969, + "learning_rate": 6.298437402263254e-06, + "loss": 0.6503, + "step": 3387 + }, + { + "epoch": 0.43, + "grad_norm": 1.3577499389648438, + "learning_rate": 6.296433450924027e-06, + "loss": 0.6072, + "step": 3388 + }, + { + "epoch": 0.43, + "grad_norm": 1.4429153203964233, + "learning_rate": 6.2944292763004885e-06, + "loss": 0.5407, + "step": 3389 + }, + { + "epoch": 0.43, + "grad_norm": 1.1351635456085205, + "learning_rate": 6.292424878737817e-06, + "loss": 0.5831, + "step": 3390 + }, + { + "epoch": 0.43, + "grad_norm": 1.355637550354004, + "learning_rate": 6.290420258581229e-06, + "loss": 0.6465, + "step": 3391 + }, + { + "epoch": 0.43, + "grad_norm": 1.0525286197662354, + "learning_rate": 6.288415416175981e-06, + "loss": 0.5857, + "step": 3392 + }, + { + "epoch": 0.43, + "grad_norm": 1.313124656677246, + "learning_rate": 6.286410351867367e-06, + "loss": 0.5992, + "step": 3393 + }, + { + "epoch": 0.43, + "grad_norm": 2.2575294971466064, + "learning_rate": 6.284405066000715e-06, + "loss": 0.5893, + "step": 3394 + }, + { + "epoch": 0.43, + "grad_norm": 1.1958906650543213, + "learning_rate": 6.282399558921398e-06, + "loss": 0.5755, + "step": 3395 + }, + { + "epoch": 0.44, + "grad_norm": 1.1793793439865112, + "learning_rate": 6.280393830974822e-06, + "loss": 0.6098, + "step": 3396 + }, + { + "epoch": 0.44, + "grad_norm": 1.2796311378479004, + "learning_rate": 6.278387882506434e-06, + "loss": 0.6819, + "step": 3397 + }, + { + "epoch": 0.44, + "grad_norm": 1.8948205709457397, + "learning_rate": 6.276381713861717e-06, + "loss": 0.6131, + "step": 3398 + }, + { + "epoch": 0.44, + "grad_norm": 1.889975905418396, + "learning_rate": 6.274375325386195e-06, + "loss": 0.6074, + "step": 3399 + }, + { + "epoch": 0.44, + "grad_norm": 1.6620714664459229, + "learning_rate": 6.272368717425423e-06, + "loss": 0.5899, + "step": 3400 + }, + { + "epoch": 0.44, + "grad_norm": 1.221790075302124, + "learning_rate": 6.270361890325003e-06, + "loss": 0.5863, + "step": 3401 + }, + { + "epoch": 0.44, + "grad_norm": 1.1769517660140991, + "learning_rate": 6.268354844430569e-06, + "loss": 0.5553, + "step": 3402 + }, + { + "epoch": 0.44, + "grad_norm": 1.1347368955612183, + "learning_rate": 6.266347580087791e-06, + "loss": 0.7537, + "step": 3403 + }, + { + "epoch": 0.44, + "grad_norm": 1.2931528091430664, + "learning_rate": 6.264340097642385e-06, + "loss": 0.5663, + "step": 3404 + }, + { + "epoch": 0.44, + "grad_norm": 1.5123640298843384, + "learning_rate": 6.2623323974400944e-06, + "loss": 0.571, + "step": 3405 + }, + { + "epoch": 0.44, + "grad_norm": 1.1496232748031616, + "learning_rate": 6.260324479826706e-06, + "loss": 0.7423, + "step": 3406 + }, + { + "epoch": 0.44, + "grad_norm": 1.2281783819198608, + "learning_rate": 6.258316345148042e-06, + "loss": 0.6239, + "step": 3407 + }, + { + "epoch": 0.44, + "grad_norm": 1.229547142982483, + "learning_rate": 6.256307993749965e-06, + "loss": 0.5493, + "step": 3408 + }, + { + "epoch": 0.44, + "grad_norm": 1.5989558696746826, + "learning_rate": 6.254299425978371e-06, + "loss": 0.5408, + "step": 3409 + }, + { + "epoch": 0.44, + "grad_norm": 1.251488208770752, + "learning_rate": 6.252290642179197e-06, + "loss": 0.6038, + "step": 3410 + }, + { + "epoch": 0.44, + "grad_norm": 3.296891450881958, + "learning_rate": 6.25028164269841e-06, + "loss": 0.6108, + "step": 3411 + }, + { + "epoch": 0.44, + "grad_norm": 1.524884819984436, + "learning_rate": 6.248272427882027e-06, + "loss": 0.683, + "step": 3412 + }, + { + "epoch": 0.44, + "grad_norm": 1.0750436782836914, + "learning_rate": 6.246262998076088e-06, + "loss": 0.69, + "step": 3413 + }, + { + "epoch": 0.44, + "grad_norm": 1.3640230894088745, + "learning_rate": 6.24425335362668e-06, + "loss": 0.6142, + "step": 3414 + }, + { + "epoch": 0.44, + "grad_norm": 1.7753424644470215, + "learning_rate": 6.242243494879923e-06, + "loss": 0.6733, + "step": 3415 + }, + { + "epoch": 0.44, + "grad_norm": 1.3016899824142456, + "learning_rate": 6.240233422181972e-06, + "loss": 0.5632, + "step": 3416 + }, + { + "epoch": 0.44, + "grad_norm": 1.370159387588501, + "learning_rate": 6.2382231358790224e-06, + "loss": 0.6562, + "step": 3417 + }, + { + "epoch": 0.44, + "grad_norm": 1.9591825008392334, + "learning_rate": 6.236212636317305e-06, + "loss": 0.6253, + "step": 3418 + }, + { + "epoch": 0.44, + "grad_norm": 1.2164580821990967, + "learning_rate": 6.23420192384309e-06, + "loss": 0.5545, + "step": 3419 + }, + { + "epoch": 0.44, + "grad_norm": 1.5938736200332642, + "learning_rate": 6.232190998802679e-06, + "loss": 0.5702, + "step": 3420 + }, + { + "epoch": 0.44, + "grad_norm": 1.2773017883300781, + "learning_rate": 6.230179861542413e-06, + "loss": 0.6788, + "step": 3421 + }, + { + "epoch": 0.44, + "grad_norm": 1.3805594444274902, + "learning_rate": 6.2281685124086714e-06, + "loss": 0.5953, + "step": 3422 + }, + { + "epoch": 0.44, + "grad_norm": 1.4324122667312622, + "learning_rate": 6.226156951747866e-06, + "loss": 0.5984, + "step": 3423 + }, + { + "epoch": 0.44, + "grad_norm": 1.1922259330749512, + "learning_rate": 6.224145179906448e-06, + "loss": 0.6201, + "step": 3424 + }, + { + "epoch": 0.44, + "grad_norm": 1.930484652519226, + "learning_rate": 6.222133197230904e-06, + "loss": 0.6431, + "step": 3425 + }, + { + "epoch": 0.44, + "grad_norm": 1.209879755973816, + "learning_rate": 6.22012100406776e-06, + "loss": 0.5888, + "step": 3426 + }, + { + "epoch": 0.44, + "grad_norm": 1.4153494834899902, + "learning_rate": 6.21810860076357e-06, + "loss": 0.6015, + "step": 3427 + }, + { + "epoch": 0.44, + "grad_norm": 1.4506012201309204, + "learning_rate": 6.216095987664935e-06, + "loss": 0.6142, + "step": 3428 + }, + { + "epoch": 0.44, + "grad_norm": 1.0460131168365479, + "learning_rate": 6.214083165118483e-06, + "loss": 0.5344, + "step": 3429 + }, + { + "epoch": 0.44, + "grad_norm": 1.4760301113128662, + "learning_rate": 6.212070133470884e-06, + "loss": 0.6131, + "step": 3430 + }, + { + "epoch": 0.44, + "grad_norm": 1.3164763450622559, + "learning_rate": 6.210056893068839e-06, + "loss": 0.607, + "step": 3431 + }, + { + "epoch": 0.44, + "grad_norm": 1.4441686868667603, + "learning_rate": 6.208043444259091e-06, + "loss": 0.7008, + "step": 3432 + }, + { + "epoch": 0.44, + "grad_norm": 1.0722826719284058, + "learning_rate": 6.206029787388412e-06, + "loss": 0.5562, + "step": 3433 + }, + { + "epoch": 0.44, + "grad_norm": 1.1804120540618896, + "learning_rate": 6.204015922803617e-06, + "loss": 0.5505, + "step": 3434 + }, + { + "epoch": 0.44, + "grad_norm": 1.25029718875885, + "learning_rate": 6.202001850851551e-06, + "loss": 0.6006, + "step": 3435 + }, + { + "epoch": 0.44, + "grad_norm": 1.1206964254379272, + "learning_rate": 6.199987571879101e-06, + "loss": 0.7474, + "step": 3436 + }, + { + "epoch": 0.44, + "grad_norm": 1.8449739217758179, + "learning_rate": 6.197973086233179e-06, + "loss": 0.651, + "step": 3437 + }, + { + "epoch": 0.44, + "grad_norm": 1.151208758354187, + "learning_rate": 6.195958394260744e-06, + "loss": 0.5435, + "step": 3438 + }, + { + "epoch": 0.44, + "grad_norm": 1.3795284032821655, + "learning_rate": 6.1939434963087845e-06, + "loss": 0.6664, + "step": 3439 + }, + { + "epoch": 0.44, + "grad_norm": 1.3607579469680786, + "learning_rate": 6.191928392724325e-06, + "loss": 0.5944, + "step": 3440 + }, + { + "epoch": 0.44, + "grad_norm": 1.5826529264450073, + "learning_rate": 6.1899130838544275e-06, + "loss": 0.645, + "step": 3441 + }, + { + "epoch": 0.44, + "grad_norm": 1.0307071208953857, + "learning_rate": 6.187897570046184e-06, + "loss": 0.5332, + "step": 3442 + }, + { + "epoch": 0.44, + "grad_norm": 1.3897781372070312, + "learning_rate": 6.185881851646732e-06, + "loss": 0.652, + "step": 3443 + }, + { + "epoch": 0.44, + "grad_norm": 0.9002603888511658, + "learning_rate": 6.1838659290032346e-06, + "loss": 0.5409, + "step": 3444 + }, + { + "epoch": 0.44, + "grad_norm": 1.2274670600891113, + "learning_rate": 6.181849802462895e-06, + "loss": 0.5836, + "step": 3445 + }, + { + "epoch": 0.44, + "grad_norm": 1.3531590700149536, + "learning_rate": 6.179833472372947e-06, + "loss": 0.598, + "step": 3446 + }, + { + "epoch": 0.44, + "grad_norm": 1.2528719902038574, + "learning_rate": 6.177816939080666e-06, + "loss": 0.5959, + "step": 3447 + }, + { + "epoch": 0.44, + "grad_norm": 1.0928164720535278, + "learning_rate": 6.175800202933355e-06, + "loss": 0.5685, + "step": 3448 + }, + { + "epoch": 0.44, + "grad_norm": 1.4518722295761108, + "learning_rate": 6.17378326427836e-06, + "loss": 0.6363, + "step": 3449 + }, + { + "epoch": 0.44, + "grad_norm": 1.127073884010315, + "learning_rate": 6.171766123463054e-06, + "loss": 0.79, + "step": 3450 + }, + { + "epoch": 0.44, + "grad_norm": 1.7623597383499146, + "learning_rate": 6.1697487808348525e-06, + "loss": 0.6219, + "step": 3451 + }, + { + "epoch": 0.44, + "grad_norm": 1.348456621170044, + "learning_rate": 6.167731236741199e-06, + "loss": 0.5674, + "step": 3452 + }, + { + "epoch": 0.44, + "grad_norm": 1.6265017986297607, + "learning_rate": 6.1657134915295735e-06, + "loss": 0.5659, + "step": 3453 + }, + { + "epoch": 0.44, + "grad_norm": 1.057990550994873, + "learning_rate": 6.163695545547494e-06, + "loss": 0.6351, + "step": 3454 + }, + { + "epoch": 0.44, + "grad_norm": 3.3038055896759033, + "learning_rate": 6.16167739914251e-06, + "loss": 0.6228, + "step": 3455 + }, + { + "epoch": 0.44, + "grad_norm": 1.0282056331634521, + "learning_rate": 6.159659052662207e-06, + "loss": 0.5298, + "step": 3456 + }, + { + "epoch": 0.44, + "grad_norm": 1.179836630821228, + "learning_rate": 6.157640506454201e-06, + "loss": 0.5805, + "step": 3457 + }, + { + "epoch": 0.44, + "grad_norm": 1.278550148010254, + "learning_rate": 6.15562176086615e-06, + "loss": 0.6652, + "step": 3458 + }, + { + "epoch": 0.44, + "grad_norm": 1.3121113777160645, + "learning_rate": 6.153602816245738e-06, + "loss": 0.6242, + "step": 3459 + }, + { + "epoch": 0.44, + "grad_norm": 1.404284119606018, + "learning_rate": 6.1515836729406874e-06, + "loss": 0.6148, + "step": 3460 + }, + { + "epoch": 0.44, + "grad_norm": 1.0569359064102173, + "learning_rate": 6.149564331298758e-06, + "loss": 0.605, + "step": 3461 + }, + { + "epoch": 0.44, + "grad_norm": 1.422186017036438, + "learning_rate": 6.147544791667738e-06, + "loss": 0.6296, + "step": 3462 + }, + { + "epoch": 0.44, + "grad_norm": 1.2817769050598145, + "learning_rate": 6.145525054395453e-06, + "loss": 0.6151, + "step": 3463 + }, + { + "epoch": 0.44, + "grad_norm": 3.4974191188812256, + "learning_rate": 6.143505119829759e-06, + "loss": 0.5476, + "step": 3464 + }, + { + "epoch": 0.44, + "grad_norm": 1.151680827140808, + "learning_rate": 6.141484988318554e-06, + "loss": 0.588, + "step": 3465 + }, + { + "epoch": 0.44, + "grad_norm": 1.0980405807495117, + "learning_rate": 6.139464660209757e-06, + "loss": 0.5949, + "step": 3466 + }, + { + "epoch": 0.44, + "grad_norm": 2.5018017292022705, + "learning_rate": 6.137444135851336e-06, + "loss": 0.5604, + "step": 3467 + }, + { + "epoch": 0.44, + "grad_norm": 1.1532684564590454, + "learning_rate": 6.1354234155912795e-06, + "loss": 0.6558, + "step": 3468 + }, + { + "epoch": 0.44, + "grad_norm": 1.3535691499710083, + "learning_rate": 6.133402499777621e-06, + "loss": 0.6153, + "step": 3469 + }, + { + "epoch": 0.44, + "grad_norm": 1.7174805402755737, + "learning_rate": 6.131381388758417e-06, + "loss": 0.5909, + "step": 3470 + }, + { + "epoch": 0.44, + "grad_norm": 1.1755709648132324, + "learning_rate": 6.1293600828817655e-06, + "loss": 0.5663, + "step": 3471 + }, + { + "epoch": 0.44, + "grad_norm": 1.8840426206588745, + "learning_rate": 6.127338582495796e-06, + "loss": 0.6317, + "step": 3472 + }, + { + "epoch": 0.44, + "grad_norm": 1.3639739751815796, + "learning_rate": 6.125316887948668e-06, + "loss": 0.6246, + "step": 3473 + }, + { + "epoch": 0.45, + "grad_norm": 1.087632179260254, + "learning_rate": 6.123294999588579e-06, + "loss": 0.5259, + "step": 3474 + }, + { + "epoch": 0.45, + "grad_norm": 1.6002978086471558, + "learning_rate": 6.1212729177637595e-06, + "loss": 0.5469, + "step": 3475 + }, + { + "epoch": 0.45, + "grad_norm": 1.169373869895935, + "learning_rate": 6.119250642822469e-06, + "loss": 0.646, + "step": 3476 + }, + { + "epoch": 0.45, + "grad_norm": 2.6361258029937744, + "learning_rate": 6.117228175113006e-06, + "loss": 0.6835, + "step": 3477 + }, + { + "epoch": 0.45, + "grad_norm": 1.1000622510910034, + "learning_rate": 6.115205514983699e-06, + "loss": 0.621, + "step": 3478 + }, + { + "epoch": 0.45, + "grad_norm": 1.3290541172027588, + "learning_rate": 6.1131826627829085e-06, + "loss": 0.6461, + "step": 3479 + }, + { + "epoch": 0.45, + "grad_norm": 1.3654758930206299, + "learning_rate": 6.11115961885903e-06, + "loss": 0.6303, + "step": 3480 + }, + { + "epoch": 0.45, + "grad_norm": 1.3180681467056274, + "learning_rate": 6.109136383560494e-06, + "loss": 0.6891, + "step": 3481 + }, + { + "epoch": 0.45, + "grad_norm": 1.4984737634658813, + "learning_rate": 6.107112957235759e-06, + "loss": 0.6644, + "step": 3482 + }, + { + "epoch": 0.45, + "grad_norm": 1.4471309185028076, + "learning_rate": 6.10508934023332e-06, + "loss": 0.5985, + "step": 3483 + }, + { + "epoch": 0.45, + "grad_norm": 1.3030848503112793, + "learning_rate": 6.103065532901704e-06, + "loss": 0.7308, + "step": 3484 + }, + { + "epoch": 0.45, + "grad_norm": 1.357759714126587, + "learning_rate": 6.101041535589469e-06, + "loss": 0.6637, + "step": 3485 + }, + { + "epoch": 0.45, + "grad_norm": 1.5871968269348145, + "learning_rate": 6.099017348645212e-06, + "loss": 0.6919, + "step": 3486 + }, + { + "epoch": 0.45, + "grad_norm": 1.0771772861480713, + "learning_rate": 6.096992972417552e-06, + "loss": 0.6352, + "step": 3487 + }, + { + "epoch": 0.45, + "grad_norm": 1.3609187602996826, + "learning_rate": 6.094968407255153e-06, + "loss": 0.6257, + "step": 3488 + }, + { + "epoch": 0.45, + "grad_norm": 1.1798498630523682, + "learning_rate": 6.092943653506701e-06, + "loss": 0.59, + "step": 3489 + }, + { + "epoch": 0.45, + "grad_norm": 1.488000512123108, + "learning_rate": 6.090918711520918e-06, + "loss": 0.6466, + "step": 3490 + }, + { + "epoch": 0.45, + "grad_norm": 1.2035020589828491, + "learning_rate": 6.088893581646562e-06, + "loss": 0.5723, + "step": 3491 + }, + { + "epoch": 0.45, + "grad_norm": 1.004976749420166, + "learning_rate": 6.086868264232418e-06, + "loss": 0.6055, + "step": 3492 + }, + { + "epoch": 0.45, + "grad_norm": 2.643667697906494, + "learning_rate": 6.084842759627309e-06, + "loss": 0.5617, + "step": 3493 + }, + { + "epoch": 0.45, + "grad_norm": 2.7203829288482666, + "learning_rate": 6.082817068180081e-06, + "loss": 0.6339, + "step": 3494 + }, + { + "epoch": 0.45, + "grad_norm": 1.1511204242706299, + "learning_rate": 6.0807911902396255e-06, + "loss": 0.6117, + "step": 3495 + }, + { + "epoch": 0.45, + "grad_norm": 1.4033151865005493, + "learning_rate": 6.078765126154854e-06, + "loss": 0.5826, + "step": 3496 + }, + { + "epoch": 0.45, + "grad_norm": 1.21249520778656, + "learning_rate": 6.076738876274718e-06, + "loss": 0.6407, + "step": 3497 + }, + { + "epoch": 0.45, + "grad_norm": 1.0692471265792847, + "learning_rate": 6.074712440948194e-06, + "loss": 0.586, + "step": 3498 + }, + { + "epoch": 0.45, + "grad_norm": 1.459241509437561, + "learning_rate": 6.0726858205242974e-06, + "loss": 0.581, + "step": 3499 + }, + { + "epoch": 0.45, + "grad_norm": 1.218548059463501, + "learning_rate": 6.070659015352072e-06, + "loss": 0.5934, + "step": 3500 + }, + { + "epoch": 0.45, + "grad_norm": 1.2451094388961792, + "learning_rate": 6.068632025780592e-06, + "loss": 0.6442, + "step": 3501 + }, + { + "epoch": 0.45, + "grad_norm": 1.7080090045928955, + "learning_rate": 6.066604852158966e-06, + "loss": 0.6385, + "step": 3502 + }, + { + "epoch": 0.45, + "grad_norm": 1.4963022470474243, + "learning_rate": 6.064577494836334e-06, + "loss": 0.5995, + "step": 3503 + }, + { + "epoch": 0.45, + "grad_norm": 1.1907035112380981, + "learning_rate": 6.0625499541618686e-06, + "loss": 0.6659, + "step": 3504 + }, + { + "epoch": 0.45, + "grad_norm": 0.9348851442337036, + "learning_rate": 6.060522230484769e-06, + "loss": 0.5889, + "step": 3505 + }, + { + "epoch": 0.45, + "grad_norm": 1.1017441749572754, + "learning_rate": 6.058494324154272e-06, + "loss": 0.6732, + "step": 3506 + }, + { + "epoch": 0.45, + "grad_norm": 1.096999168395996, + "learning_rate": 6.056466235519641e-06, + "loss": 0.5974, + "step": 3507 + }, + { + "epoch": 0.45, + "grad_norm": 1.2315013408660889, + "learning_rate": 6.054437964930175e-06, + "loss": 0.6482, + "step": 3508 + }, + { + "epoch": 0.45, + "grad_norm": 1.2487025260925293, + "learning_rate": 6.052409512735202e-06, + "loss": 0.6511, + "step": 3509 + }, + { + "epoch": 0.45, + "grad_norm": 1.2520768642425537, + "learning_rate": 6.0503808792840825e-06, + "loss": 0.6488, + "step": 3510 + }, + { + "epoch": 0.45, + "grad_norm": 1.1143486499786377, + "learning_rate": 6.048352064926204e-06, + "loss": 0.6537, + "step": 3511 + }, + { + "epoch": 0.45, + "grad_norm": 1.371921181678772, + "learning_rate": 6.046323070010993e-06, + "loss": 0.5731, + "step": 3512 + }, + { + "epoch": 0.45, + "grad_norm": 2.219484806060791, + "learning_rate": 6.0442938948879006e-06, + "loss": 0.5992, + "step": 3513 + }, + { + "epoch": 0.45, + "grad_norm": 1.838356614112854, + "learning_rate": 6.04226453990641e-06, + "loss": 0.591, + "step": 3514 + }, + { + "epoch": 0.45, + "grad_norm": 1.2704648971557617, + "learning_rate": 6.040235005416037e-06, + "loss": 0.6266, + "step": 3515 + }, + { + "epoch": 0.45, + "grad_norm": 1.7678539752960205, + "learning_rate": 6.038205291766328e-06, + "loss": 0.6643, + "step": 3516 + }, + { + "epoch": 0.45, + "grad_norm": 1.7963787317276, + "learning_rate": 6.036175399306861e-06, + "loss": 0.628, + "step": 3517 + }, + { + "epoch": 0.45, + "grad_norm": 1.342311143875122, + "learning_rate": 6.034145328387239e-06, + "loss": 0.6382, + "step": 3518 + }, + { + "epoch": 0.45, + "grad_norm": 1.220090627670288, + "learning_rate": 6.032115079357108e-06, + "loss": 0.5598, + "step": 3519 + }, + { + "epoch": 0.45, + "grad_norm": 1.0853689908981323, + "learning_rate": 6.0300846525661305e-06, + "loss": 0.5895, + "step": 3520 + }, + { + "epoch": 0.45, + "grad_norm": 1.2728458642959595, + "learning_rate": 6.028054048364011e-06, + "loss": 0.645, + "step": 3521 + }, + { + "epoch": 0.45, + "grad_norm": 1.1695315837860107, + "learning_rate": 6.026023267100474e-06, + "loss": 0.5413, + "step": 3522 + }, + { + "epoch": 0.45, + "grad_norm": 1.3852046728134155, + "learning_rate": 6.023992309125288e-06, + "loss": 0.6413, + "step": 3523 + }, + { + "epoch": 0.45, + "grad_norm": 5.460791110992432, + "learning_rate": 6.021961174788239e-06, + "loss": 0.6022, + "step": 3524 + }, + { + "epoch": 0.45, + "grad_norm": 1.5186138153076172, + "learning_rate": 6.019929864439149e-06, + "loss": 0.6042, + "step": 3525 + }, + { + "epoch": 0.45, + "grad_norm": 1.498050332069397, + "learning_rate": 6.017898378427871e-06, + "loss": 0.7024, + "step": 3526 + }, + { + "epoch": 0.45, + "grad_norm": 1.3453694581985474, + "learning_rate": 6.015866717104288e-06, + "loss": 0.6183, + "step": 3527 + }, + { + "epoch": 0.45, + "grad_norm": 1.3376102447509766, + "learning_rate": 6.01383488081831e-06, + "loss": 0.6636, + "step": 3528 + }, + { + "epoch": 0.45, + "grad_norm": 1.0858659744262695, + "learning_rate": 6.01180286991988e-06, + "loss": 0.7165, + "step": 3529 + }, + { + "epoch": 0.45, + "grad_norm": 1.561975121498108, + "learning_rate": 6.009770684758973e-06, + "loss": 0.6641, + "step": 3530 + }, + { + "epoch": 0.45, + "grad_norm": 1.422297716140747, + "learning_rate": 6.007738325685588e-06, + "loss": 0.6278, + "step": 3531 + }, + { + "epoch": 0.45, + "grad_norm": 1.3532339334487915, + "learning_rate": 6.005705793049762e-06, + "loss": 0.5749, + "step": 3532 + }, + { + "epoch": 0.45, + "grad_norm": 1.1300181150436401, + "learning_rate": 6.0036730872015524e-06, + "loss": 0.6098, + "step": 3533 + }, + { + "epoch": 0.45, + "grad_norm": 1.3219990730285645, + "learning_rate": 6.001640208491056e-06, + "loss": 0.5893, + "step": 3534 + }, + { + "epoch": 0.45, + "grad_norm": 2.5730981826782227, + "learning_rate": 5.9996071572683914e-06, + "loss": 0.553, + "step": 3535 + }, + { + "epoch": 0.45, + "grad_norm": 1.663468837738037, + "learning_rate": 5.997573933883714e-06, + "loss": 0.5648, + "step": 3536 + }, + { + "epoch": 0.45, + "grad_norm": 1.451084017753601, + "learning_rate": 5.995540538687199e-06, + "loss": 0.6901, + "step": 3537 + }, + { + "epoch": 0.45, + "grad_norm": 3.200803518295288, + "learning_rate": 5.993506972029064e-06, + "loss": 0.595, + "step": 3538 + }, + { + "epoch": 0.45, + "grad_norm": 1.3902348279953003, + "learning_rate": 5.991473234259546e-06, + "loss": 0.6697, + "step": 3539 + }, + { + "epoch": 0.45, + "grad_norm": 2.0828068256378174, + "learning_rate": 5.989439325728916e-06, + "loss": 0.6027, + "step": 3540 + }, + { + "epoch": 0.45, + "grad_norm": 1.1694917678833008, + "learning_rate": 5.987405246787474e-06, + "loss": 0.5812, + "step": 3541 + }, + { + "epoch": 0.45, + "grad_norm": 1.7300596237182617, + "learning_rate": 5.985370997785547e-06, + "loss": 0.5532, + "step": 3542 + }, + { + "epoch": 0.45, + "grad_norm": 1.1379985809326172, + "learning_rate": 5.983336579073495e-06, + "loss": 0.6126, + "step": 3543 + }, + { + "epoch": 0.45, + "grad_norm": 1.147905945777893, + "learning_rate": 5.981301991001703e-06, + "loss": 0.5544, + "step": 3544 + }, + { + "epoch": 0.45, + "grad_norm": 1.108622431755066, + "learning_rate": 5.979267233920589e-06, + "loss": 0.5836, + "step": 3545 + }, + { + "epoch": 0.45, + "grad_norm": 1.3943743705749512, + "learning_rate": 5.9772323081805985e-06, + "loss": 0.6146, + "step": 3546 + }, + { + "epoch": 0.45, + "grad_norm": 1.359615683555603, + "learning_rate": 5.975197214132207e-06, + "loss": 0.6164, + "step": 3547 + }, + { + "epoch": 0.45, + "grad_norm": 1.324974775314331, + "learning_rate": 5.9731619521259156e-06, + "loss": 0.6021, + "step": 3548 + }, + { + "epoch": 0.45, + "grad_norm": 1.6578129529953003, + "learning_rate": 5.97112652251226e-06, + "loss": 0.5934, + "step": 3549 + }, + { + "epoch": 0.45, + "grad_norm": 1.3419653177261353, + "learning_rate": 5.969090925641799e-06, + "loss": 0.7274, + "step": 3550 + }, + { + "epoch": 0.45, + "grad_norm": 1.1800451278686523, + "learning_rate": 5.9670551618651225e-06, + "loss": 0.7183, + "step": 3551 + }, + { + "epoch": 0.46, + "grad_norm": 1.0364329814910889, + "learning_rate": 5.96501923153285e-06, + "loss": 0.5909, + "step": 3552 + }, + { + "epoch": 0.46, + "grad_norm": 1.199339509010315, + "learning_rate": 5.9629831349956325e-06, + "loss": 0.5893, + "step": 3553 + }, + { + "epoch": 0.46, + "grad_norm": 1.2047299146652222, + "learning_rate": 5.960946872604141e-06, + "loss": 0.6378, + "step": 3554 + }, + { + "epoch": 0.46, + "grad_norm": 1.2108663320541382, + "learning_rate": 5.958910444709083e-06, + "loss": 0.5957, + "step": 3555 + }, + { + "epoch": 0.46, + "grad_norm": 1.2545571327209473, + "learning_rate": 5.956873851661192e-06, + "loss": 0.5564, + "step": 3556 + }, + { + "epoch": 0.46, + "grad_norm": 1.1796643733978271, + "learning_rate": 5.954837093811229e-06, + "loss": 0.6311, + "step": 3557 + }, + { + "epoch": 0.46, + "grad_norm": 1.4320603609085083, + "learning_rate": 5.952800171509985e-06, + "loss": 0.6326, + "step": 3558 + }, + { + "epoch": 0.46, + "grad_norm": 1.1937462091445923, + "learning_rate": 5.950763085108277e-06, + "loss": 0.6688, + "step": 3559 + }, + { + "epoch": 0.46, + "grad_norm": 1.2120587825775146, + "learning_rate": 5.948725834956952e-06, + "loss": 0.6689, + "step": 3560 + }, + { + "epoch": 0.46, + "grad_norm": 1.330200433731079, + "learning_rate": 5.946688421406886e-06, + "loss": 0.5936, + "step": 3561 + }, + { + "epoch": 0.46, + "grad_norm": 1.3225212097167969, + "learning_rate": 5.9446508448089825e-06, + "loss": 0.6483, + "step": 3562 + }, + { + "epoch": 0.46, + "grad_norm": 1.0410070419311523, + "learning_rate": 5.942613105514171e-06, + "loss": 0.5963, + "step": 3563 + }, + { + "epoch": 0.46, + "grad_norm": 1.1120356321334839, + "learning_rate": 5.940575203873411e-06, + "loss": 0.6553, + "step": 3564 + }, + { + "epoch": 0.46, + "grad_norm": 1.1769014596939087, + "learning_rate": 5.9385371402376914e-06, + "loss": 0.6784, + "step": 3565 + }, + { + "epoch": 0.46, + "grad_norm": 2.005772590637207, + "learning_rate": 5.936498914958025e-06, + "loss": 0.6325, + "step": 3566 + }, + { + "epoch": 0.46, + "grad_norm": 1.3367499113082886, + "learning_rate": 5.934460528385456e-06, + "loss": 0.6275, + "step": 3567 + }, + { + "epoch": 0.46, + "grad_norm": 1.1705892086029053, + "learning_rate": 5.932421980871054e-06, + "loss": 0.5585, + "step": 3568 + }, + { + "epoch": 0.46, + "grad_norm": 1.4280080795288086, + "learning_rate": 5.930383272765918e-06, + "loss": 0.5989, + "step": 3569 + }, + { + "epoch": 0.46, + "grad_norm": 1.7109063863754272, + "learning_rate": 5.928344404421174e-06, + "loss": 0.5889, + "step": 3570 + }, + { + "epoch": 0.46, + "grad_norm": 1.2693599462509155, + "learning_rate": 5.926305376187976e-06, + "loss": 0.6736, + "step": 3571 + }, + { + "epoch": 0.46, + "grad_norm": 1.3794386386871338, + "learning_rate": 5.924266188417503e-06, + "loss": 0.5768, + "step": 3572 + }, + { + "epoch": 0.46, + "grad_norm": 1.2329838275909424, + "learning_rate": 5.922226841460968e-06, + "loss": 0.6015, + "step": 3573 + }, + { + "epoch": 0.46, + "grad_norm": 1.6483304500579834, + "learning_rate": 5.920187335669602e-06, + "loss": 0.6534, + "step": 3574 + }, + { + "epoch": 0.46, + "grad_norm": 1.2386940717697144, + "learning_rate": 5.918147671394674e-06, + "loss": 0.5888, + "step": 3575 + }, + { + "epoch": 0.46, + "grad_norm": 1.224981427192688, + "learning_rate": 5.91610784898747e-06, + "loss": 0.525, + "step": 3576 + }, + { + "epoch": 0.46, + "grad_norm": 1.332261085510254, + "learning_rate": 5.91406786879931e-06, + "loss": 0.5301, + "step": 3577 + }, + { + "epoch": 0.46, + "grad_norm": 1.2706077098846436, + "learning_rate": 5.912027731181539e-06, + "loss": 0.5354, + "step": 3578 + }, + { + "epoch": 0.46, + "grad_norm": 1.3086779117584229, + "learning_rate": 5.909987436485527e-06, + "loss": 0.5638, + "step": 3579 + }, + { + "epoch": 0.46, + "grad_norm": 2.0201406478881836, + "learning_rate": 5.907946985062678e-06, + "loss": 0.6415, + "step": 3580 + }, + { + "epoch": 0.46, + "grad_norm": 1.47654390335083, + "learning_rate": 5.905906377264413e-06, + "loss": 0.6439, + "step": 3581 + }, + { + "epoch": 0.46, + "grad_norm": 1.4309604167938232, + "learning_rate": 5.90386561344219e-06, + "loss": 0.6235, + "step": 3582 + }, + { + "epoch": 0.46, + "grad_norm": 1.3017194271087646, + "learning_rate": 5.901824693947486e-06, + "loss": 0.6122, + "step": 3583 + }, + { + "epoch": 0.46, + "grad_norm": 1.1456342935562134, + "learning_rate": 5.899783619131809e-06, + "loss": 0.5598, + "step": 3584 + }, + { + "epoch": 0.46, + "grad_norm": 2.045018196105957, + "learning_rate": 5.897742389346691e-06, + "loss": 0.6183, + "step": 3585 + }, + { + "epoch": 0.46, + "grad_norm": 1.2723512649536133, + "learning_rate": 5.8957010049436955e-06, + "loss": 0.5801, + "step": 3586 + }, + { + "epoch": 0.46, + "grad_norm": 1.087829828262329, + "learning_rate": 5.893659466274407e-06, + "loss": 0.5959, + "step": 3587 + }, + { + "epoch": 0.46, + "grad_norm": 1.7654427289962769, + "learning_rate": 5.891617773690439e-06, + "loss": 0.6837, + "step": 3588 + }, + { + "epoch": 0.46, + "grad_norm": 1.581198811531067, + "learning_rate": 5.8895759275434326e-06, + "loss": 0.5811, + "step": 3589 + }, + { + "epoch": 0.46, + "grad_norm": 1.239762544631958, + "learning_rate": 5.887533928185054e-06, + "loss": 0.5911, + "step": 3590 + }, + { + "epoch": 0.46, + "grad_norm": 1.2970668077468872, + "learning_rate": 5.8854917759669975e-06, + "loss": 0.6369, + "step": 3591 + }, + { + "epoch": 0.46, + "grad_norm": 1.130383014678955, + "learning_rate": 5.88344947124098e-06, + "loss": 0.576, + "step": 3592 + }, + { + "epoch": 0.46, + "grad_norm": 1.180657148361206, + "learning_rate": 5.881407014358748e-06, + "loss": 0.5828, + "step": 3593 + }, + { + "epoch": 0.46, + "grad_norm": 1.4534460306167603, + "learning_rate": 5.879364405672072e-06, + "loss": 0.6106, + "step": 3594 + }, + { + "epoch": 0.46, + "grad_norm": 1.3066177368164062, + "learning_rate": 5.877321645532752e-06, + "loss": 0.6507, + "step": 3595 + }, + { + "epoch": 0.46, + "grad_norm": 1.6938061714172363, + "learning_rate": 5.87527873429261e-06, + "loss": 0.6446, + "step": 3596 + }, + { + "epoch": 0.46, + "grad_norm": 2.4884603023529053, + "learning_rate": 5.873235672303497e-06, + "loss": 0.6266, + "step": 3597 + }, + { + "epoch": 0.46, + "grad_norm": 1.3594398498535156, + "learning_rate": 5.871192459917287e-06, + "loss": 0.5882, + "step": 3598 + }, + { + "epoch": 0.46, + "grad_norm": 1.6385754346847534, + "learning_rate": 5.869149097485887e-06, + "loss": 0.7136, + "step": 3599 + }, + { + "epoch": 0.46, + "grad_norm": 1.3254238367080688, + "learning_rate": 5.867105585361218e-06, + "loss": 0.5929, + "step": 3600 + }, + { + "epoch": 0.46, + "grad_norm": 1.1824506521224976, + "learning_rate": 5.865061923895238e-06, + "loss": 0.6646, + "step": 3601 + }, + { + "epoch": 0.46, + "grad_norm": 1.4841578006744385, + "learning_rate": 5.863018113439925e-06, + "loss": 0.6157, + "step": 3602 + }, + { + "epoch": 0.46, + "grad_norm": 1.1699411869049072, + "learning_rate": 5.860974154347282e-06, + "loss": 0.5475, + "step": 3603 + }, + { + "epoch": 0.46, + "grad_norm": 1.5281282663345337, + "learning_rate": 5.858930046969341e-06, + "loss": 0.6103, + "step": 3604 + }, + { + "epoch": 0.46, + "grad_norm": 1.1899720430374146, + "learning_rate": 5.856885791658158e-06, + "loss": 0.6138, + "step": 3605 + }, + { + "epoch": 0.46, + "grad_norm": 1.1821370124816895, + "learning_rate": 5.854841388765816e-06, + "loss": 0.583, + "step": 3606 + }, + { + "epoch": 0.46, + "grad_norm": 2.455402135848999, + "learning_rate": 5.8527968386444174e-06, + "loss": 0.6418, + "step": 3607 + }, + { + "epoch": 0.46, + "grad_norm": 1.7864829301834106, + "learning_rate": 5.8507521416461e-06, + "loss": 0.5578, + "step": 3608 + }, + { + "epoch": 0.46, + "grad_norm": 1.501908540725708, + "learning_rate": 5.848707298123017e-06, + "loss": 0.6361, + "step": 3609 + }, + { + "epoch": 0.46, + "grad_norm": 1.2245491743087769, + "learning_rate": 5.846662308427352e-06, + "loss": 0.6692, + "step": 3610 + }, + { + "epoch": 0.46, + "grad_norm": 1.385204792022705, + "learning_rate": 5.844617172911313e-06, + "loss": 0.639, + "step": 3611 + }, + { + "epoch": 0.46, + "grad_norm": 1.3009589910507202, + "learning_rate": 5.8425718919271356e-06, + "loss": 0.6582, + "step": 3612 + }, + { + "epoch": 0.46, + "grad_norm": 1.1194837093353271, + "learning_rate": 5.8405264658270735e-06, + "loss": 0.5595, + "step": 3613 + }, + { + "epoch": 0.46, + "grad_norm": 1.1085587739944458, + "learning_rate": 5.838480894963412e-06, + "loss": 0.5775, + "step": 3614 + }, + { + "epoch": 0.46, + "grad_norm": 1.2732248306274414, + "learning_rate": 5.836435179688461e-06, + "loss": 0.5688, + "step": 3615 + }, + { + "epoch": 0.46, + "grad_norm": 1.1878724098205566, + "learning_rate": 5.83438932035455e-06, + "loss": 0.6376, + "step": 3616 + }, + { + "epoch": 0.46, + "grad_norm": 1.7219138145446777, + "learning_rate": 5.8323433173140395e-06, + "loss": 0.6421, + "step": 3617 + }, + { + "epoch": 0.46, + "grad_norm": 1.5152086019515991, + "learning_rate": 5.830297170919309e-06, + "loss": 0.6828, + "step": 3618 + }, + { + "epoch": 0.46, + "grad_norm": 1.371113657951355, + "learning_rate": 5.828250881522769e-06, + "loss": 0.6614, + "step": 3619 + }, + { + "epoch": 0.46, + "grad_norm": 1.0953738689422607, + "learning_rate": 5.826204449476848e-06, + "loss": 0.6094, + "step": 3620 + }, + { + "epoch": 0.46, + "grad_norm": 1.253603219985962, + "learning_rate": 5.824157875134005e-06, + "loss": 0.6148, + "step": 3621 + }, + { + "epoch": 0.46, + "grad_norm": 1.2168421745300293, + "learning_rate": 5.822111158846718e-06, + "loss": 0.6631, + "step": 3622 + }, + { + "epoch": 0.46, + "grad_norm": 1.5407350063323975, + "learning_rate": 5.820064300967494e-06, + "loss": 0.6399, + "step": 3623 + }, + { + "epoch": 0.46, + "grad_norm": 1.245928168296814, + "learning_rate": 5.8180173018488625e-06, + "loss": 0.4972, + "step": 3624 + }, + { + "epoch": 0.46, + "grad_norm": 1.1214627027511597, + "learning_rate": 5.815970161843379e-06, + "loss": 0.5604, + "step": 3625 + }, + { + "epoch": 0.46, + "grad_norm": 1.016115427017212, + "learning_rate": 5.81392288130362e-06, + "loss": 0.6543, + "step": 3626 + }, + { + "epoch": 0.46, + "grad_norm": 1.1197785139083862, + "learning_rate": 5.811875460582189e-06, + "loss": 0.6655, + "step": 3627 + }, + { + "epoch": 0.46, + "grad_norm": 1.4029167890548706, + "learning_rate": 5.80982790003171e-06, + "loss": 0.6326, + "step": 3628 + }, + { + "epoch": 0.46, + "grad_norm": 1.0858590602874756, + "learning_rate": 5.807780200004838e-06, + "loss": 0.6388, + "step": 3629 + }, + { + "epoch": 0.47, + "grad_norm": 1.1979694366455078, + "learning_rate": 5.805732360854243e-06, + "loss": 0.5826, + "step": 3630 + }, + { + "epoch": 0.47, + "grad_norm": 2.004387617111206, + "learning_rate": 5.803684382932626e-06, + "loss": 0.6769, + "step": 3631 + }, + { + "epoch": 0.47, + "grad_norm": 1.0634325742721558, + "learning_rate": 5.801636266592712e-06, + "loss": 0.629, + "step": 3632 + }, + { + "epoch": 0.47, + "grad_norm": 1.3099150657653809, + "learning_rate": 5.799588012187243e-06, + "loss": 0.6693, + "step": 3633 + }, + { + "epoch": 0.47, + "grad_norm": 1.3465301990509033, + "learning_rate": 5.797539620068992e-06, + "loss": 0.6112, + "step": 3634 + }, + { + "epoch": 0.47, + "grad_norm": 1.356348991394043, + "learning_rate": 5.79549109059075e-06, + "loss": 0.6714, + "step": 3635 + }, + { + "epoch": 0.47, + "grad_norm": 1.0583562850952148, + "learning_rate": 5.793442424105339e-06, + "loss": 0.5755, + "step": 3636 + }, + { + "epoch": 0.47, + "grad_norm": 1.416865348815918, + "learning_rate": 5.791393620965597e-06, + "loss": 0.6358, + "step": 3637 + }, + { + "epoch": 0.47, + "grad_norm": 1.24928617477417, + "learning_rate": 5.789344681524389e-06, + "loss": 0.6334, + "step": 3638 + }, + { + "epoch": 0.47, + "grad_norm": 1.223817229270935, + "learning_rate": 5.7872956061346045e-06, + "loss": 0.6467, + "step": 3639 + }, + { + "epoch": 0.47, + "grad_norm": 0.9584287405014038, + "learning_rate": 5.785246395149152e-06, + "loss": 0.5455, + "step": 3640 + }, + { + "epoch": 0.47, + "grad_norm": 0.9955974221229553, + "learning_rate": 5.783197048920972e-06, + "loss": 0.6464, + "step": 3641 + }, + { + "epoch": 0.47, + "grad_norm": 1.016220211982727, + "learning_rate": 5.781147567803017e-06, + "loss": 0.5736, + "step": 3642 + }, + { + "epoch": 0.47, + "grad_norm": 1.1931318044662476, + "learning_rate": 5.779097952148273e-06, + "loss": 0.6193, + "step": 3643 + }, + { + "epoch": 0.47, + "grad_norm": 1.3940238952636719, + "learning_rate": 5.77704820230974e-06, + "loss": 0.691, + "step": 3644 + }, + { + "epoch": 0.47, + "grad_norm": 1.2070850133895874, + "learning_rate": 5.77499831864045e-06, + "loss": 0.6053, + "step": 3645 + }, + { + "epoch": 0.47, + "grad_norm": 1.3859649896621704, + "learning_rate": 5.772948301493451e-06, + "loss": 0.6045, + "step": 3646 + }, + { + "epoch": 0.47, + "grad_norm": 1.1878433227539062, + "learning_rate": 5.770898151221819e-06, + "loss": 0.5722, + "step": 3647 + }, + { + "epoch": 0.47, + "grad_norm": 1.1642296314239502, + "learning_rate": 5.768847868178646e-06, + "loss": 0.6328, + "step": 3648 + }, + { + "epoch": 0.47, + "grad_norm": 8.618324279785156, + "learning_rate": 5.766797452717059e-06, + "loss": 0.6084, + "step": 3649 + }, + { + "epoch": 0.47, + "grad_norm": 1.2117552757263184, + "learning_rate": 5.764746905190195e-06, + "loss": 0.5757, + "step": 3650 + }, + { + "epoch": 0.47, + "grad_norm": 1.7341634035110474, + "learning_rate": 5.762696225951222e-06, + "loss": 0.6151, + "step": 3651 + }, + { + "epoch": 0.47, + "grad_norm": 1.1232070922851562, + "learning_rate": 5.760645415353325e-06, + "loss": 0.5258, + "step": 3652 + }, + { + "epoch": 0.47, + "grad_norm": 1.6535828113555908, + "learning_rate": 5.7585944737497165e-06, + "loss": 0.6518, + "step": 3653 + }, + { + "epoch": 0.47, + "grad_norm": 1.4652224779129028, + "learning_rate": 5.756543401493628e-06, + "loss": 0.6398, + "step": 3654 + }, + { + "epoch": 0.47, + "grad_norm": 1.3363951444625854, + "learning_rate": 5.754492198938318e-06, + "loss": 0.6407, + "step": 3655 + }, + { + "epoch": 0.47, + "grad_norm": 1.152831792831421, + "learning_rate": 5.752440866437062e-06, + "loss": 0.537, + "step": 3656 + }, + { + "epoch": 0.47, + "grad_norm": 1.4328360557556152, + "learning_rate": 5.750389404343159e-06, + "loss": 0.6318, + "step": 3657 + }, + { + "epoch": 0.47, + "grad_norm": 1.8562523126602173, + "learning_rate": 5.748337813009934e-06, + "loss": 0.5985, + "step": 3658 + }, + { + "epoch": 0.47, + "grad_norm": 0.9919341802597046, + "learning_rate": 5.74628609279073e-06, + "loss": 0.6107, + "step": 3659 + }, + { + "epoch": 0.47, + "grad_norm": 1.3245062828063965, + "learning_rate": 5.744234244038918e-06, + "loss": 0.526, + "step": 3660 + }, + { + "epoch": 0.47, + "grad_norm": 1.3959466218948364, + "learning_rate": 5.742182267107883e-06, + "loss": 0.6278, + "step": 3661 + }, + { + "epoch": 0.47, + "grad_norm": 1.1084336042404175, + "learning_rate": 5.740130162351039e-06, + "loss": 0.5771, + "step": 3662 + }, + { + "epoch": 0.47, + "grad_norm": 1.2701683044433594, + "learning_rate": 5.738077930121817e-06, + "loss": 0.5676, + "step": 3663 + }, + { + "epoch": 0.47, + "grad_norm": 1.0271109342575073, + "learning_rate": 5.736025570773675e-06, + "loss": 0.6557, + "step": 3664 + }, + { + "epoch": 0.47, + "grad_norm": 1.5346267223358154, + "learning_rate": 5.7339730846600875e-06, + "loss": 0.4982, + "step": 3665 + }, + { + "epoch": 0.47, + "grad_norm": 1.2214652299880981, + "learning_rate": 5.7319204721345536e-06, + "loss": 0.6193, + "step": 3666 + }, + { + "epoch": 0.47, + "grad_norm": 1.1910988092422485, + "learning_rate": 5.729867733550597e-06, + "loss": 0.5806, + "step": 3667 + }, + { + "epoch": 0.47, + "grad_norm": 1.1290035247802734, + "learning_rate": 5.727814869261758e-06, + "loss": 0.611, + "step": 3668 + }, + { + "epoch": 0.47, + "grad_norm": 1.1266953945159912, + "learning_rate": 5.725761879621601e-06, + "loss": 0.628, + "step": 3669 + }, + { + "epoch": 0.47, + "grad_norm": 1.3203330039978027, + "learning_rate": 5.723708764983712e-06, + "loss": 0.6415, + "step": 3670 + }, + { + "epoch": 0.47, + "grad_norm": 1.3828309774398804, + "learning_rate": 5.721655525701699e-06, + "loss": 0.5446, + "step": 3671 + }, + { + "epoch": 0.47, + "grad_norm": 1.2213470935821533, + "learning_rate": 5.719602162129189e-06, + "loss": 0.6274, + "step": 3672 + }, + { + "epoch": 0.47, + "grad_norm": 1.2695316076278687, + "learning_rate": 5.7175486746198325e-06, + "loss": 0.58, + "step": 3673 + }, + { + "epoch": 0.47, + "grad_norm": 1.0185368061065674, + "learning_rate": 5.715495063527301e-06, + "loss": 0.6232, + "step": 3674 + }, + { + "epoch": 0.47, + "grad_norm": 1.2112317085266113, + "learning_rate": 5.713441329205289e-06, + "loss": 0.6422, + "step": 3675 + }, + { + "epoch": 0.47, + "grad_norm": 1.6289145946502686, + "learning_rate": 5.711387472007509e-06, + "loss": 0.6632, + "step": 3676 + }, + { + "epoch": 0.47, + "grad_norm": 1.5209150314331055, + "learning_rate": 5.709333492287698e-06, + "loss": 0.6227, + "step": 3677 + }, + { + "epoch": 0.47, + "grad_norm": 1.4312928915023804, + "learning_rate": 5.7072793903996085e-06, + "loss": 0.66, + "step": 3678 + }, + { + "epoch": 0.47, + "grad_norm": 1.2010070085525513, + "learning_rate": 5.705225166697022e-06, + "loss": 0.6936, + "step": 3679 + }, + { + "epoch": 0.47, + "grad_norm": 1.0900729894638062, + "learning_rate": 5.703170821533733e-06, + "loss": 0.6763, + "step": 3680 + }, + { + "epoch": 0.47, + "grad_norm": 1.209028720855713, + "learning_rate": 5.701116355263563e-06, + "loss": 0.5245, + "step": 3681 + }, + { + "epoch": 0.47, + "grad_norm": 1.0546902418136597, + "learning_rate": 5.69906176824035e-06, + "loss": 0.5515, + "step": 3682 + }, + { + "epoch": 0.47, + "grad_norm": 1.3346909284591675, + "learning_rate": 5.697007060817958e-06, + "loss": 0.57, + "step": 3683 + }, + { + "epoch": 0.47, + "grad_norm": 1.3135395050048828, + "learning_rate": 5.694952233350268e-06, + "loss": 0.5915, + "step": 3684 + }, + { + "epoch": 0.47, + "grad_norm": 1.2499960660934448, + "learning_rate": 5.692897286191179e-06, + "loss": 0.5783, + "step": 3685 + }, + { + "epoch": 0.47, + "grad_norm": 1.2707788944244385, + "learning_rate": 5.690842219694619e-06, + "loss": 0.6796, + "step": 3686 + }, + { + "epoch": 0.47, + "grad_norm": 1.1295020580291748, + "learning_rate": 5.688787034214525e-06, + "loss": 0.6514, + "step": 3687 + }, + { + "epoch": 0.47, + "grad_norm": 1.445489764213562, + "learning_rate": 5.6867317301048676e-06, + "loss": 0.7057, + "step": 3688 + }, + { + "epoch": 0.47, + "grad_norm": 1.3477917909622192, + "learning_rate": 5.684676307719626e-06, + "loss": 0.5777, + "step": 3689 + }, + { + "epoch": 0.47, + "grad_norm": 1.2112998962402344, + "learning_rate": 5.682620767412808e-06, + "loss": 0.6103, + "step": 3690 + }, + { + "epoch": 0.47, + "grad_norm": 1.5050208568572998, + "learning_rate": 5.6805651095384375e-06, + "loss": 0.6307, + "step": 3691 + }, + { + "epoch": 0.47, + "grad_norm": 1.221442461013794, + "learning_rate": 5.678509334450559e-06, + "loss": 0.5603, + "step": 3692 + }, + { + "epoch": 0.47, + "grad_norm": 1.4392536878585815, + "learning_rate": 5.67645344250324e-06, + "loss": 0.6022, + "step": 3693 + }, + { + "epoch": 0.47, + "grad_norm": 1.1971774101257324, + "learning_rate": 5.6743974340505645e-06, + "loss": 0.5572, + "step": 3694 + }, + { + "epoch": 0.47, + "grad_norm": 1.7652573585510254, + "learning_rate": 5.672341309446639e-06, + "loss": 0.6265, + "step": 3695 + }, + { + "epoch": 0.47, + "grad_norm": 1.4171807765960693, + "learning_rate": 5.670285069045588e-06, + "loss": 0.6642, + "step": 3696 + }, + { + "epoch": 0.47, + "grad_norm": 1.8250502347946167, + "learning_rate": 5.668228713201559e-06, + "loss": 0.6942, + "step": 3697 + }, + { + "epoch": 0.47, + "grad_norm": 1.0191811323165894, + "learning_rate": 5.666172242268715e-06, + "loss": 0.5493, + "step": 3698 + }, + { + "epoch": 0.47, + "grad_norm": 1.2496845722198486, + "learning_rate": 5.664115656601243e-06, + "loss": 0.6399, + "step": 3699 + }, + { + "epoch": 0.47, + "grad_norm": 1.2776678800582886, + "learning_rate": 5.662058956553348e-06, + "loss": 0.6093, + "step": 3700 + }, + { + "epoch": 0.47, + "grad_norm": 1.5822676420211792, + "learning_rate": 5.660002142479255e-06, + "loss": 0.5506, + "step": 3701 + }, + { + "epoch": 0.47, + "grad_norm": 1.53562593460083, + "learning_rate": 5.657945214733208e-06, + "loss": 0.6304, + "step": 3702 + }, + { + "epoch": 0.47, + "grad_norm": 1.2964261770248413, + "learning_rate": 5.655888173669472e-06, + "loss": 0.6556, + "step": 3703 + }, + { + "epoch": 0.47, + "grad_norm": 1.3869433403015137, + "learning_rate": 5.653831019642327e-06, + "loss": 0.7004, + "step": 3704 + }, + { + "epoch": 0.47, + "grad_norm": 1.1649458408355713, + "learning_rate": 5.6517737530060815e-06, + "loss": 0.5943, + "step": 3705 + }, + { + "epoch": 0.47, + "grad_norm": 1.3618543148040771, + "learning_rate": 5.649716374115053e-06, + "loss": 0.6762, + "step": 3706 + }, + { + "epoch": 0.47, + "grad_norm": 1.0227161645889282, + "learning_rate": 5.6476588833235846e-06, + "loss": 0.5979, + "step": 3707 + }, + { + "epoch": 0.48, + "grad_norm": 1.4849395751953125, + "learning_rate": 5.645601280986038e-06, + "loss": 0.5881, + "step": 3708 + }, + { + "epoch": 0.48, + "grad_norm": 1.639639139175415, + "learning_rate": 5.643543567456793e-06, + "loss": 0.5817, + "step": 3709 + }, + { + "epoch": 0.48, + "grad_norm": 1.9866191148757935, + "learning_rate": 5.64148574309025e-06, + "loss": 0.6601, + "step": 3710 + }, + { + "epoch": 0.48, + "grad_norm": 1.609009027481079, + "learning_rate": 5.639427808240825e-06, + "loss": 0.5858, + "step": 3711 + }, + { + "epoch": 0.48, + "grad_norm": 1.3619134426116943, + "learning_rate": 5.637369763262959e-06, + "loss": 0.7077, + "step": 3712 + }, + { + "epoch": 0.48, + "grad_norm": 1.349634051322937, + "learning_rate": 5.635311608511105e-06, + "loss": 0.6249, + "step": 3713 + }, + { + "epoch": 0.48, + "grad_norm": 1.1673775911331177, + "learning_rate": 5.63325334433974e-06, + "loss": 0.6222, + "step": 3714 + }, + { + "epoch": 0.48, + "grad_norm": 1.4045964479446411, + "learning_rate": 5.631194971103357e-06, + "loss": 0.5758, + "step": 3715 + }, + { + "epoch": 0.48, + "grad_norm": 1.5967957973480225, + "learning_rate": 5.6291364891564704e-06, + "loss": 0.6434, + "step": 3716 + }, + { + "epoch": 0.48, + "grad_norm": 1.5817066431045532, + "learning_rate": 5.62707789885361e-06, + "loss": 0.6438, + "step": 3717 + }, + { + "epoch": 0.48, + "grad_norm": 1.5195118188858032, + "learning_rate": 5.6250192005493285e-06, + "loss": 0.6614, + "step": 3718 + }, + { + "epoch": 0.48, + "grad_norm": 1.3318872451782227, + "learning_rate": 5.622960394598194e-06, + "loss": 0.598, + "step": 3719 + }, + { + "epoch": 0.48, + "grad_norm": 1.229845643043518, + "learning_rate": 5.620901481354792e-06, + "loss": 0.5759, + "step": 3720 + }, + { + "epoch": 0.48, + "grad_norm": 1.2453598976135254, + "learning_rate": 5.6188424611737325e-06, + "loss": 0.671, + "step": 3721 + }, + { + "epoch": 0.48, + "grad_norm": 1.5366243124008179, + "learning_rate": 5.616783334409637e-06, + "loss": 0.7277, + "step": 3722 + }, + { + "epoch": 0.48, + "grad_norm": 1.2466989755630493, + "learning_rate": 5.614724101417148e-06, + "loss": 0.6157, + "step": 3723 + }, + { + "epoch": 0.48, + "grad_norm": 1.4630953073501587, + "learning_rate": 5.612664762550927e-06, + "loss": 0.6182, + "step": 3724 + }, + { + "epoch": 0.48, + "grad_norm": 1.7095898389816284, + "learning_rate": 5.6106053181656535e-06, + "loss": 0.6675, + "step": 3725 + }, + { + "epoch": 0.48, + "grad_norm": 1.4824260473251343, + "learning_rate": 5.608545768616025e-06, + "loss": 0.5571, + "step": 3726 + }, + { + "epoch": 0.48, + "grad_norm": 1.4626764059066772, + "learning_rate": 5.6064861142567575e-06, + "loss": 0.6716, + "step": 3727 + }, + { + "epoch": 0.48, + "grad_norm": 1.6166949272155762, + "learning_rate": 5.604426355442584e-06, + "loss": 0.6997, + "step": 3728 + }, + { + "epoch": 0.48, + "grad_norm": 1.882332444190979, + "learning_rate": 5.602366492528256e-06, + "loss": 0.6062, + "step": 3729 + }, + { + "epoch": 0.48, + "grad_norm": 1.456566333770752, + "learning_rate": 5.6003065258685444e-06, + "loss": 0.5865, + "step": 3730 + }, + { + "epoch": 0.48, + "grad_norm": 1.3899471759796143, + "learning_rate": 5.5982464558182335e-06, + "loss": 0.5982, + "step": 3731 + }, + { + "epoch": 0.48, + "grad_norm": 1.3733237981796265, + "learning_rate": 5.596186282732132e-06, + "loss": 0.6052, + "step": 3732 + }, + { + "epoch": 0.48, + "grad_norm": 1.0841834545135498, + "learning_rate": 5.59412600696506e-06, + "loss": 0.5684, + "step": 3733 + }, + { + "epoch": 0.48, + "grad_norm": 1.2034820318222046, + "learning_rate": 5.59206562887186e-06, + "loss": 0.5606, + "step": 3734 + }, + { + "epoch": 0.48, + "grad_norm": 1.2253649234771729, + "learning_rate": 5.5900051488073896e-06, + "loss": 0.5615, + "step": 3735 + }, + { + "epoch": 0.48, + "grad_norm": 2.54776930809021, + "learning_rate": 5.587944567126525e-06, + "loss": 0.6376, + "step": 3736 + }, + { + "epoch": 0.48, + "grad_norm": 1.1381703615188599, + "learning_rate": 5.585883884184158e-06, + "loss": 0.594, + "step": 3737 + }, + { + "epoch": 0.48, + "grad_norm": 1.3217531442642212, + "learning_rate": 5.583823100335202e-06, + "loss": 0.7467, + "step": 3738 + }, + { + "epoch": 0.48, + "grad_norm": 1.229705810546875, + "learning_rate": 5.581762215934582e-06, + "loss": 0.4965, + "step": 3739 + }, + { + "epoch": 0.48, + "grad_norm": 1.090378761291504, + "learning_rate": 5.579701231337247e-06, + "loss": 0.6012, + "step": 3740 + }, + { + "epoch": 0.48, + "grad_norm": 1.2407397031784058, + "learning_rate": 5.577640146898156e-06, + "loss": 0.6052, + "step": 3741 + }, + { + "epoch": 0.48, + "grad_norm": 2.4652295112609863, + "learning_rate": 5.57557896297229e-06, + "loss": 0.6058, + "step": 3742 + }, + { + "epoch": 0.48, + "grad_norm": 1.1912661790847778, + "learning_rate": 5.5735176799146486e-06, + "loss": 0.5969, + "step": 3743 + }, + { + "epoch": 0.48, + "grad_norm": 1.246437668800354, + "learning_rate": 5.571456298080243e-06, + "loss": 0.6003, + "step": 3744 + }, + { + "epoch": 0.48, + "grad_norm": 1.1998566389083862, + "learning_rate": 5.569394817824106e-06, + "loss": 0.612, + "step": 3745 + }, + { + "epoch": 0.48, + "grad_norm": 1.8297500610351562, + "learning_rate": 5.567333239501284e-06, + "loss": 0.5909, + "step": 3746 + }, + { + "epoch": 0.48, + "grad_norm": 1.1892491579055786, + "learning_rate": 5.565271563466845e-06, + "loss": 0.5661, + "step": 3747 + }, + { + "epoch": 0.48, + "grad_norm": 1.076907753944397, + "learning_rate": 5.5632097900758676e-06, + "loss": 0.5741, + "step": 3748 + }, + { + "epoch": 0.48, + "grad_norm": 1.0066630840301514, + "learning_rate": 5.561147919683451e-06, + "loss": 0.5962, + "step": 3749 + }, + { + "epoch": 0.48, + "grad_norm": 2.04477858543396, + "learning_rate": 5.559085952644711e-06, + "loss": 0.5698, + "step": 3750 + }, + { + "epoch": 0.48, + "grad_norm": 1.407403588294983, + "learning_rate": 5.5570238893147795e-06, + "loss": 0.6276, + "step": 3751 + }, + { + "epoch": 0.48, + "grad_norm": 1.328902244567871, + "learning_rate": 5.554961730048806e-06, + "loss": 0.579, + "step": 3752 + }, + { + "epoch": 0.48, + "grad_norm": 1.0437157154083252, + "learning_rate": 5.5528994752019535e-06, + "loss": 0.6284, + "step": 3753 + }, + { + "epoch": 0.48, + "grad_norm": 1.2737390995025635, + "learning_rate": 5.550837125129406e-06, + "loss": 0.6508, + "step": 3754 + }, + { + "epoch": 0.48, + "grad_norm": 1.1763293743133545, + "learning_rate": 5.548774680186359e-06, + "loss": 0.5689, + "step": 3755 + }, + { + "epoch": 0.48, + "grad_norm": 1.2019954919815063, + "learning_rate": 5.5467121407280275e-06, + "loss": 0.7561, + "step": 3756 + }, + { + "epoch": 0.48, + "grad_norm": 1.4074658155441284, + "learning_rate": 5.544649507109642e-06, + "loss": 0.6113, + "step": 3757 + }, + { + "epoch": 0.48, + "grad_norm": 1.317198395729065, + "learning_rate": 5.5425867796864496e-06, + "loss": 0.6282, + "step": 3758 + }, + { + "epoch": 0.48, + "grad_norm": 1.7032959461212158, + "learning_rate": 5.54052395881371e-06, + "loss": 0.6331, + "step": 3759 + }, + { + "epoch": 0.48, + "grad_norm": 1.3925243616104126, + "learning_rate": 5.5384610448467095e-06, + "loss": 0.563, + "step": 3760 + }, + { + "epoch": 0.48, + "grad_norm": 2.3951807022094727, + "learning_rate": 5.536398038140735e-06, + "loss": 0.596, + "step": 3761 + }, + { + "epoch": 0.48, + "grad_norm": 1.2922788858413696, + "learning_rate": 5.534334939051104e-06, + "loss": 0.5835, + "step": 3762 + }, + { + "epoch": 0.48, + "grad_norm": 1.4560922384262085, + "learning_rate": 5.53227174793314e-06, + "loss": 0.5856, + "step": 3763 + }, + { + "epoch": 0.48, + "grad_norm": 1.7121245861053467, + "learning_rate": 5.530208465142186e-06, + "loss": 0.5596, + "step": 3764 + }, + { + "epoch": 0.48, + "grad_norm": 1.4115746021270752, + "learning_rate": 5.5281450910336e-06, + "loss": 0.6768, + "step": 3765 + }, + { + "epoch": 0.48, + "grad_norm": 2.208176612854004, + "learning_rate": 5.526081625962758e-06, + "loss": 0.5841, + "step": 3766 + }, + { + "epoch": 0.48, + "grad_norm": 1.182554841041565, + "learning_rate": 5.524018070285047e-06, + "loss": 0.6007, + "step": 3767 + }, + { + "epoch": 0.48, + "grad_norm": 1.3877570629119873, + "learning_rate": 5.521954424355876e-06, + "loss": 0.6617, + "step": 3768 + }, + { + "epoch": 0.48, + "grad_norm": 1.7248724699020386, + "learning_rate": 5.519890688530666e-06, + "loss": 0.6391, + "step": 3769 + }, + { + "epoch": 0.48, + "grad_norm": 1.0161206722259521, + "learning_rate": 5.517826863164849e-06, + "loss": 0.4795, + "step": 3770 + }, + { + "epoch": 0.48, + "grad_norm": 1.1962370872497559, + "learning_rate": 5.515762948613882e-06, + "loss": 0.6225, + "step": 3771 + }, + { + "epoch": 0.48, + "grad_norm": 1.3718147277832031, + "learning_rate": 5.51369894523323e-06, + "loss": 0.6185, + "step": 3772 + }, + { + "epoch": 0.48, + "grad_norm": 1.4110352993011475, + "learning_rate": 5.5116348533783755e-06, + "loss": 0.6968, + "step": 3773 + }, + { + "epoch": 0.48, + "grad_norm": 1.2778741121292114, + "learning_rate": 5.509570673404819e-06, + "loss": 0.5913, + "step": 3774 + }, + { + "epoch": 0.48, + "grad_norm": 0.9584821462631226, + "learning_rate": 5.50750640566807e-06, + "loss": 0.5655, + "step": 3775 + }, + { + "epoch": 0.48, + "grad_norm": 1.1385620832443237, + "learning_rate": 5.505442050523655e-06, + "loss": 0.703, + "step": 3776 + }, + { + "epoch": 0.48, + "grad_norm": 1.690279483795166, + "learning_rate": 5.503377608327124e-06, + "loss": 0.5459, + "step": 3777 + }, + { + "epoch": 0.48, + "grad_norm": 1.3032708168029785, + "learning_rate": 5.50131307943403e-06, + "loss": 0.6358, + "step": 3778 + }, + { + "epoch": 0.48, + "grad_norm": 1.129211187362671, + "learning_rate": 5.499248464199949e-06, + "loss": 0.5988, + "step": 3779 + }, + { + "epoch": 0.48, + "grad_norm": 1.3283401727676392, + "learning_rate": 5.497183762980467e-06, + "loss": 0.6212, + "step": 3780 + }, + { + "epoch": 0.48, + "grad_norm": 1.4811829328536987, + "learning_rate": 5.495118976131187e-06, + "loss": 0.5867, + "step": 3781 + }, + { + "epoch": 0.48, + "grad_norm": 1.861901044845581, + "learning_rate": 5.493054104007728e-06, + "loss": 0.6727, + "step": 3782 + }, + { + "epoch": 0.48, + "grad_norm": 1.4480305910110474, + "learning_rate": 5.49098914696572e-06, + "loss": 0.6174, + "step": 3783 + }, + { + "epoch": 0.48, + "grad_norm": 1.423263669013977, + "learning_rate": 5.488924105360812e-06, + "loss": 0.6232, + "step": 3784 + }, + { + "epoch": 0.48, + "grad_norm": 1.5136548280715942, + "learning_rate": 5.486858979548663e-06, + "loss": 0.5616, + "step": 3785 + }, + { + "epoch": 0.49, + "grad_norm": 1.1748825311660767, + "learning_rate": 5.484793769884953e-06, + "loss": 0.5792, + "step": 3786 + }, + { + "epoch": 0.49, + "grad_norm": 1.2067316770553589, + "learning_rate": 5.482728476725369e-06, + "loss": 0.5731, + "step": 3787 + }, + { + "epoch": 0.49, + "grad_norm": 1.8080260753631592, + "learning_rate": 5.480663100425616e-06, + "loss": 0.6952, + "step": 3788 + }, + { + "epoch": 0.49, + "grad_norm": 1.5747623443603516, + "learning_rate": 5.478597641341414e-06, + "loss": 0.5669, + "step": 3789 + }, + { + "epoch": 0.49, + "grad_norm": 1.224440574645996, + "learning_rate": 5.476532099828498e-06, + "loss": 0.5727, + "step": 3790 + }, + { + "epoch": 0.49, + "grad_norm": 1.2634854316711426, + "learning_rate": 5.474466476242611e-06, + "loss": 0.6208, + "step": 3791 + }, + { + "epoch": 0.49, + "grad_norm": 1.3283199071884155, + "learning_rate": 5.472400770939519e-06, + "loss": 0.6815, + "step": 3792 + }, + { + "epoch": 0.49, + "grad_norm": 1.3618751764297485, + "learning_rate": 5.470334984274995e-06, + "loss": 0.58, + "step": 3793 + }, + { + "epoch": 0.49, + "grad_norm": 1.400259256362915, + "learning_rate": 5.468269116604831e-06, + "loss": 0.6144, + "step": 3794 + }, + { + "epoch": 0.49, + "grad_norm": 1.376218557357788, + "learning_rate": 5.466203168284829e-06, + "loss": 0.6464, + "step": 3795 + }, + { + "epoch": 0.49, + "grad_norm": 1.2561371326446533, + "learning_rate": 5.464137139670806e-06, + "loss": 0.5849, + "step": 3796 + }, + { + "epoch": 0.49, + "grad_norm": 1.5874736309051514, + "learning_rate": 5.4620710311185976e-06, + "loss": 0.6173, + "step": 3797 + }, + { + "epoch": 0.49, + "grad_norm": 1.9282337427139282, + "learning_rate": 5.460004842984044e-06, + "loss": 0.5781, + "step": 3798 + }, + { + "epoch": 0.49, + "grad_norm": 1.2490649223327637, + "learning_rate": 5.457938575623008e-06, + "loss": 0.6326, + "step": 3799 + }, + { + "epoch": 0.49, + "grad_norm": 1.4292243719100952, + "learning_rate": 5.455872229391357e-06, + "loss": 0.5963, + "step": 3800 + }, + { + "epoch": 0.49, + "grad_norm": 1.1780860424041748, + "learning_rate": 5.453805804644984e-06, + "loss": 0.5695, + "step": 3801 + }, + { + "epoch": 0.49, + "grad_norm": 1.3638889789581299, + "learning_rate": 5.451739301739782e-06, + "loss": 0.5805, + "step": 3802 + }, + { + "epoch": 0.49, + "grad_norm": 1.8172688484191895, + "learning_rate": 5.449672721031668e-06, + "loss": 0.5556, + "step": 3803 + }, + { + "epoch": 0.49, + "grad_norm": 1.3854845762252808, + "learning_rate": 5.447606062876569e-06, + "loss": 0.6247, + "step": 3804 + }, + { + "epoch": 0.49, + "grad_norm": 1.3911125659942627, + "learning_rate": 5.445539327630423e-06, + "loss": 0.5315, + "step": 3805 + }, + { + "epoch": 0.49, + "grad_norm": 1.301125407218933, + "learning_rate": 5.443472515649184e-06, + "loss": 0.6608, + "step": 3806 + }, + { + "epoch": 0.49, + "grad_norm": 1.013878583908081, + "learning_rate": 5.441405627288817e-06, + "loss": 0.5742, + "step": 3807 + }, + { + "epoch": 0.49, + "grad_norm": 2.087625026702881, + "learning_rate": 5.439338662905305e-06, + "loss": 0.6331, + "step": 3808 + }, + { + "epoch": 0.49, + "grad_norm": 1.1464579105377197, + "learning_rate": 5.437271622854635e-06, + "loss": 0.6458, + "step": 3809 + }, + { + "epoch": 0.49, + "grad_norm": 1.453547477722168, + "learning_rate": 5.435204507492819e-06, + "loss": 0.6398, + "step": 3810 + }, + { + "epoch": 0.49, + "grad_norm": 1.6091513633728027, + "learning_rate": 5.433137317175868e-06, + "loss": 0.5774, + "step": 3811 + }, + { + "epoch": 0.49, + "grad_norm": 1.2415236234664917, + "learning_rate": 5.431070052259821e-06, + "loss": 0.7783, + "step": 3812 + }, + { + "epoch": 0.49, + "grad_norm": 1.310966968536377, + "learning_rate": 5.42900271310072e-06, + "loss": 0.7047, + "step": 3813 + }, + { + "epoch": 0.49, + "grad_norm": 1.3866527080535889, + "learning_rate": 5.426935300054621e-06, + "loss": 0.6902, + "step": 3814 + }, + { + "epoch": 0.49, + "grad_norm": 1.3021939992904663, + "learning_rate": 5.424867813477593e-06, + "loss": 0.5693, + "step": 3815 + }, + { + "epoch": 0.49, + "grad_norm": 1.2108157873153687, + "learning_rate": 5.422800253725722e-06, + "loss": 0.5726, + "step": 3816 + }, + { + "epoch": 0.49, + "grad_norm": 1.936347246170044, + "learning_rate": 5.4207326211550995e-06, + "loss": 0.6049, + "step": 3817 + }, + { + "epoch": 0.49, + "grad_norm": 1.1145234107971191, + "learning_rate": 5.418664916121835e-06, + "loss": 0.5592, + "step": 3818 + }, + { + "epoch": 0.49, + "grad_norm": 5.271641254425049, + "learning_rate": 5.416597138982048e-06, + "loss": 0.5187, + "step": 3819 + }, + { + "epoch": 0.49, + "grad_norm": 1.0878288745880127, + "learning_rate": 5.414529290091872e-06, + "loss": 0.6153, + "step": 3820 + }, + { + "epoch": 0.49, + "grad_norm": 1.5007294416427612, + "learning_rate": 5.412461369807451e-06, + "loss": 0.621, + "step": 3821 + }, + { + "epoch": 0.49, + "grad_norm": 1.5491704940795898, + "learning_rate": 5.4103933784849435e-06, + "loss": 0.723, + "step": 3822 + }, + { + "epoch": 0.49, + "grad_norm": 1.1629546880722046, + "learning_rate": 5.408325316480518e-06, + "loss": 0.5476, + "step": 3823 + }, + { + "epoch": 0.49, + "grad_norm": 1.2649697065353394, + "learning_rate": 5.406257184150355e-06, + "loss": 0.6794, + "step": 3824 + }, + { + "epoch": 0.49, + "grad_norm": 1.6518555879592896, + "learning_rate": 5.404188981850651e-06, + "loss": 0.6568, + "step": 3825 + }, + { + "epoch": 0.49, + "grad_norm": 1.191433310508728, + "learning_rate": 5.4021207099376095e-06, + "loss": 0.6148, + "step": 3826 + }, + { + "epoch": 0.49, + "grad_norm": 1.2688812017440796, + "learning_rate": 5.40005236876745e-06, + "loss": 0.6096, + "step": 3827 + }, + { + "epoch": 0.49, + "grad_norm": 1.7701101303100586, + "learning_rate": 5.3979839586964e-06, + "loss": 0.5587, + "step": 3828 + }, + { + "epoch": 0.49, + "grad_norm": 1.3165203332901, + "learning_rate": 5.3959154800807025e-06, + "loss": 0.5454, + "step": 3829 + }, + { + "epoch": 0.49, + "grad_norm": 1.3157920837402344, + "learning_rate": 5.393846933276612e-06, + "loss": 0.6478, + "step": 3830 + }, + { + "epoch": 0.49, + "grad_norm": 1.2996734380722046, + "learning_rate": 5.391778318640392e-06, + "loss": 0.5351, + "step": 3831 + }, + { + "epoch": 0.49, + "grad_norm": 1.2892003059387207, + "learning_rate": 5.38970963652832e-06, + "loss": 0.5722, + "step": 3832 + }, + { + "epoch": 0.49, + "grad_norm": 1.160461187362671, + "learning_rate": 5.387640887296683e-06, + "loss": 0.6198, + "step": 3833 + }, + { + "epoch": 0.49, + "grad_norm": 1.233054757118225, + "learning_rate": 5.385572071301785e-06, + "loss": 0.5894, + "step": 3834 + }, + { + "epoch": 0.49, + "grad_norm": 1.3636419773101807, + "learning_rate": 5.383503188899932e-06, + "loss": 0.6296, + "step": 3835 + }, + { + "epoch": 0.49, + "grad_norm": 1.6694684028625488, + "learning_rate": 5.381434240447451e-06, + "loss": 0.6466, + "step": 3836 + }, + { + "epoch": 0.49, + "grad_norm": 1.753450632095337, + "learning_rate": 5.379365226300673e-06, + "loss": 0.6464, + "step": 3837 + }, + { + "epoch": 0.49, + "grad_norm": 1.236421823501587, + "learning_rate": 5.377296146815949e-06, + "loss": 0.5271, + "step": 3838 + }, + { + "epoch": 0.49, + "grad_norm": 1.3160569667816162, + "learning_rate": 5.37522700234963e-06, + "loss": 0.7266, + "step": 3839 + }, + { + "epoch": 0.49, + "grad_norm": 1.5594069957733154, + "learning_rate": 5.373157793258088e-06, + "loss": 0.6761, + "step": 3840 + }, + { + "epoch": 0.49, + "grad_norm": 1.5676730871200562, + "learning_rate": 5.3710885198977004e-06, + "loss": 0.5312, + "step": 3841 + }, + { + "epoch": 0.49, + "grad_norm": 1.762484073638916, + "learning_rate": 5.369019182624858e-06, + "loss": 0.6218, + "step": 3842 + }, + { + "epoch": 0.49, + "grad_norm": 1.2499971389770508, + "learning_rate": 5.366949781795961e-06, + "loss": 0.6244, + "step": 3843 + }, + { + "epoch": 0.49, + "grad_norm": 1.2467316389083862, + "learning_rate": 5.364880317767424e-06, + "loss": 0.6309, + "step": 3844 + }, + { + "epoch": 0.49, + "grad_norm": 1.1873960494995117, + "learning_rate": 5.362810790895668e-06, + "loss": 0.5959, + "step": 3845 + }, + { + "epoch": 0.49, + "grad_norm": 1.2639389038085938, + "learning_rate": 5.360741201537127e-06, + "loss": 0.6166, + "step": 3846 + }, + { + "epoch": 0.49, + "grad_norm": 1.4507334232330322, + "learning_rate": 5.358671550048249e-06, + "loss": 0.5928, + "step": 3847 + }, + { + "epoch": 0.49, + "grad_norm": 1.1256029605865479, + "learning_rate": 5.356601836785484e-06, + "loss": 0.5958, + "step": 3848 + }, + { + "epoch": 0.49, + "grad_norm": 1.2243516445159912, + "learning_rate": 5.354532062105303e-06, + "loss": 0.5836, + "step": 3849 + }, + { + "epoch": 0.49, + "grad_norm": 1.5672364234924316, + "learning_rate": 5.352462226364179e-06, + "loss": 0.6359, + "step": 3850 + }, + { + "epoch": 0.49, + "grad_norm": 1.424888014793396, + "learning_rate": 5.350392329918601e-06, + "loss": 0.6901, + "step": 3851 + }, + { + "epoch": 0.49, + "grad_norm": 3.7046003341674805, + "learning_rate": 5.348322373125065e-06, + "loss": 0.6442, + "step": 3852 + }, + { + "epoch": 0.49, + "grad_norm": 1.4008851051330566, + "learning_rate": 5.346252356340082e-06, + "loss": 0.5696, + "step": 3853 + }, + { + "epoch": 0.49, + "grad_norm": 1.6207784414291382, + "learning_rate": 5.344182279920167e-06, + "loss": 0.5765, + "step": 3854 + }, + { + "epoch": 0.49, + "grad_norm": 1.579346776008606, + "learning_rate": 5.342112144221851e-06, + "loss": 0.6231, + "step": 3855 + }, + { + "epoch": 0.49, + "grad_norm": 1.2518047094345093, + "learning_rate": 5.340041949601672e-06, + "loss": 0.5404, + "step": 3856 + }, + { + "epoch": 0.49, + "grad_norm": 1.0116968154907227, + "learning_rate": 5.337971696416178e-06, + "loss": 0.7541, + "step": 3857 + }, + { + "epoch": 0.49, + "grad_norm": 1.3027863502502441, + "learning_rate": 5.33590138502193e-06, + "loss": 0.626, + "step": 3858 + }, + { + "epoch": 0.49, + "grad_norm": 2.156329870223999, + "learning_rate": 5.3338310157754934e-06, + "loss": 0.6029, + "step": 3859 + }, + { + "epoch": 0.49, + "grad_norm": 1.7596616744995117, + "learning_rate": 5.331760589033452e-06, + "loss": 0.6789, + "step": 3860 + }, + { + "epoch": 0.49, + "grad_norm": 1.0322740077972412, + "learning_rate": 5.329690105152392e-06, + "loss": 0.637, + "step": 3861 + }, + { + "epoch": 0.49, + "grad_norm": 1.2053539752960205, + "learning_rate": 5.3276195644889115e-06, + "loss": 0.5274, + "step": 3862 + }, + { + "epoch": 0.49, + "grad_norm": 1.20881986618042, + "learning_rate": 5.325548967399621e-06, + "loss": 0.636, + "step": 3863 + }, + { + "epoch": 0.5, + "grad_norm": 1.379366159439087, + "learning_rate": 5.323478314241138e-06, + "loss": 0.598, + "step": 3864 + }, + { + "epoch": 0.5, + "grad_norm": 1.3362188339233398, + "learning_rate": 5.321407605370091e-06, + "loss": 0.7068, + "step": 3865 + }, + { + "epoch": 0.5, + "grad_norm": 1.4871771335601807, + "learning_rate": 5.319336841143116e-06, + "loss": 0.6618, + "step": 3866 + }, + { + "epoch": 0.5, + "grad_norm": 1.288758397102356, + "learning_rate": 5.31726602191686e-06, + "loss": 0.7501, + "step": 3867 + }, + { + "epoch": 0.5, + "grad_norm": 1.3813121318817139, + "learning_rate": 5.315195148047981e-06, + "loss": 0.6301, + "step": 3868 + }, + { + "epoch": 0.5, + "grad_norm": 1.2369877099990845, + "learning_rate": 5.313124219893145e-06, + "loss": 0.5641, + "step": 3869 + }, + { + "epoch": 0.5, + "grad_norm": 1.0300804376602173, + "learning_rate": 5.311053237809026e-06, + "loss": 0.6737, + "step": 3870 + }, + { + "epoch": 0.5, + "grad_norm": 1.3140389919281006, + "learning_rate": 5.30898220215231e-06, + "loss": 0.5421, + "step": 3871 + }, + { + "epoch": 0.5, + "grad_norm": 1.4346362352371216, + "learning_rate": 5.306911113279689e-06, + "loss": 0.6743, + "step": 3872 + }, + { + "epoch": 0.5, + "grad_norm": 1.0767395496368408, + "learning_rate": 5.3048399715478675e-06, + "loss": 0.6462, + "step": 3873 + }, + { + "epoch": 0.5, + "grad_norm": 1.3019919395446777, + "learning_rate": 5.302768777313556e-06, + "loss": 0.7012, + "step": 3874 + }, + { + "epoch": 0.5, + "grad_norm": 1.2964775562286377, + "learning_rate": 5.300697530933479e-06, + "loss": 0.4923, + "step": 3875 + }, + { + "epoch": 0.5, + "grad_norm": 1.0694615840911865, + "learning_rate": 5.2986262327643615e-06, + "loss": 0.5454, + "step": 3876 + }, + { + "epoch": 0.5, + "grad_norm": 1.3932045698165894, + "learning_rate": 5.296554883162947e-06, + "loss": 0.5652, + "step": 3877 + }, + { + "epoch": 0.5, + "grad_norm": 1.147936463356018, + "learning_rate": 5.29448348248598e-06, + "loss": 0.6513, + "step": 3878 + }, + { + "epoch": 0.5, + "grad_norm": 1.2141468524932861, + "learning_rate": 5.29241203109022e-06, + "loss": 0.5956, + "step": 3879 + }, + { + "epoch": 0.5, + "grad_norm": 1.1196589469909668, + "learning_rate": 5.2903405293324316e-06, + "loss": 0.5876, + "step": 3880 + }, + { + "epoch": 0.5, + "grad_norm": 1.196601152420044, + "learning_rate": 5.288268977569386e-06, + "loss": 0.6035, + "step": 3881 + }, + { + "epoch": 0.5, + "grad_norm": 1.4775373935699463, + "learning_rate": 5.286197376157871e-06, + "loss": 0.6348, + "step": 3882 + }, + { + "epoch": 0.5, + "grad_norm": 1.283056378364563, + "learning_rate": 5.284125725454674e-06, + "loss": 0.64, + "step": 3883 + }, + { + "epoch": 0.5, + "grad_norm": 1.263865351676941, + "learning_rate": 5.282054025816597e-06, + "loss": 0.6002, + "step": 3884 + }, + { + "epoch": 0.5, + "grad_norm": 1.2261013984680176, + "learning_rate": 5.279982277600445e-06, + "loss": 0.5812, + "step": 3885 + }, + { + "epoch": 0.5, + "grad_norm": 3.103773355484009, + "learning_rate": 5.277910481163038e-06, + "loss": 0.6105, + "step": 3886 + }, + { + "epoch": 0.5, + "grad_norm": 1.412146806716919, + "learning_rate": 5.275838636861198e-06, + "loss": 0.6886, + "step": 3887 + }, + { + "epoch": 0.5, + "grad_norm": 2.211158037185669, + "learning_rate": 5.273766745051761e-06, + "loss": 0.5951, + "step": 3888 + }, + { + "epoch": 0.5, + "grad_norm": 1.4581794738769531, + "learning_rate": 5.271694806091564e-06, + "loss": 0.64, + "step": 3889 + }, + { + "epoch": 0.5, + "grad_norm": 1.1060121059417725, + "learning_rate": 5.269622820337462e-06, + "loss": 0.5375, + "step": 3890 + }, + { + "epoch": 0.5, + "grad_norm": 3.2724769115448, + "learning_rate": 5.267550788146308e-06, + "loss": 0.5919, + "step": 3891 + }, + { + "epoch": 0.5, + "grad_norm": 1.2122722864151, + "learning_rate": 5.26547870987497e-06, + "loss": 0.6125, + "step": 3892 + }, + { + "epoch": 0.5, + "grad_norm": 1.1100267171859741, + "learning_rate": 5.26340658588032e-06, + "loss": 0.5679, + "step": 3893 + }, + { + "epoch": 0.5, + "grad_norm": 1.2956206798553467, + "learning_rate": 5.261334416519239e-06, + "loss": 0.6334, + "step": 3894 + }, + { + "epoch": 0.5, + "grad_norm": 1.3694170713424683, + "learning_rate": 5.259262202148616e-06, + "loss": 0.6584, + "step": 3895 + }, + { + "epoch": 0.5, + "grad_norm": 1.3362345695495605, + "learning_rate": 5.257189943125349e-06, + "loss": 0.5786, + "step": 3896 + }, + { + "epoch": 0.5, + "grad_norm": 1.2397748231887817, + "learning_rate": 5.255117639806343e-06, + "loss": 0.5881, + "step": 3897 + }, + { + "epoch": 0.5, + "grad_norm": 1.4001562595367432, + "learning_rate": 5.253045292548508e-06, + "loss": 0.646, + "step": 3898 + }, + { + "epoch": 0.5, + "grad_norm": 1.3735246658325195, + "learning_rate": 5.250972901708765e-06, + "loss": 0.8063, + "step": 3899 + }, + { + "epoch": 0.5, + "grad_norm": 1.1073076725006104, + "learning_rate": 5.248900467644041e-06, + "loss": 0.6368, + "step": 3900 + }, + { + "epoch": 0.5, + "grad_norm": 1.3196773529052734, + "learning_rate": 5.246827990711269e-06, + "loss": 0.5578, + "step": 3901 + }, + { + "epoch": 0.5, + "grad_norm": 1.3190702199935913, + "learning_rate": 5.244755471267394e-06, + "loss": 0.5742, + "step": 3902 + }, + { + "epoch": 0.5, + "grad_norm": 1.3288342952728271, + "learning_rate": 5.242682909669364e-06, + "loss": 0.5448, + "step": 3903 + }, + { + "epoch": 0.5, + "grad_norm": 1.3225739002227783, + "learning_rate": 5.240610306274134e-06, + "loss": 0.6489, + "step": 3904 + }, + { + "epoch": 0.5, + "grad_norm": 1.309205174446106, + "learning_rate": 5.238537661438671e-06, + "loss": 0.6241, + "step": 3905 + }, + { + "epoch": 0.5, + "grad_norm": 1.2318326234817505, + "learning_rate": 5.236464975519944e-06, + "loss": 0.5419, + "step": 3906 + }, + { + "epoch": 0.5, + "grad_norm": 1.4670156240463257, + "learning_rate": 5.234392248874931e-06, + "loss": 0.6127, + "step": 3907 + }, + { + "epoch": 0.5, + "grad_norm": 2.364734411239624, + "learning_rate": 5.232319481860618e-06, + "loss": 0.6175, + "step": 3908 + }, + { + "epoch": 0.5, + "grad_norm": 1.1357091665267944, + "learning_rate": 5.2302466748339954e-06, + "loss": 0.6176, + "step": 3909 + }, + { + "epoch": 0.5, + "grad_norm": 1.4520831108093262, + "learning_rate": 5.228173828152063e-06, + "loss": 0.5557, + "step": 3910 + }, + { + "epoch": 0.5, + "grad_norm": 1.7567130327224731, + "learning_rate": 5.226100942171827e-06, + "loss": 0.5868, + "step": 3911 + }, + { + "epoch": 0.5, + "grad_norm": 1.0778729915618896, + "learning_rate": 5.2240280172503e-06, + "loss": 0.6589, + "step": 3912 + }, + { + "epoch": 0.5, + "grad_norm": 1.256453037261963, + "learning_rate": 5.221955053744498e-06, + "loss": 0.6077, + "step": 3913 + }, + { + "epoch": 0.5, + "grad_norm": 4.119685649871826, + "learning_rate": 5.219882052011451e-06, + "loss": 0.6429, + "step": 3914 + }, + { + "epoch": 0.5, + "grad_norm": 1.4709926843643188, + "learning_rate": 5.217809012408191e-06, + "loss": 0.621, + "step": 3915 + }, + { + "epoch": 0.5, + "grad_norm": 1.3248754739761353, + "learning_rate": 5.215735935291755e-06, + "loss": 0.656, + "step": 3916 + }, + { + "epoch": 0.5, + "grad_norm": 1.7260463237762451, + "learning_rate": 5.213662821019187e-06, + "loss": 0.6932, + "step": 3917 + }, + { + "epoch": 0.5, + "grad_norm": 1.4484518766403198, + "learning_rate": 5.211589669947544e-06, + "loss": 0.5989, + "step": 3918 + }, + { + "epoch": 0.5, + "grad_norm": 1.3106611967086792, + "learning_rate": 5.209516482433879e-06, + "loss": 0.682, + "step": 3919 + }, + { + "epoch": 0.5, + "grad_norm": 1.3350433111190796, + "learning_rate": 5.207443258835261e-06, + "loss": 0.6008, + "step": 3920 + }, + { + "epoch": 0.5, + "grad_norm": 1.8616951704025269, + "learning_rate": 5.205369999508756e-06, + "loss": 0.5976, + "step": 3921 + }, + { + "epoch": 0.5, + "grad_norm": 1.2377861738204956, + "learning_rate": 5.203296704811443e-06, + "loss": 0.5425, + "step": 3922 + }, + { + "epoch": 0.5, + "grad_norm": 1.428403615951538, + "learning_rate": 5.201223375100406e-06, + "loss": 0.6628, + "step": 3923 + }, + { + "epoch": 0.5, + "grad_norm": 1.4202046394348145, + "learning_rate": 5.199150010732731e-06, + "loss": 0.567, + "step": 3924 + }, + { + "epoch": 0.5, + "grad_norm": 1.4062016010284424, + "learning_rate": 5.1970766120655155e-06, + "loss": 0.6508, + "step": 3925 + }, + { + "epoch": 0.5, + "grad_norm": 1.2854269742965698, + "learning_rate": 5.195003179455859e-06, + "loss": 0.6857, + "step": 3926 + }, + { + "epoch": 0.5, + "grad_norm": 1.979161024093628, + "learning_rate": 5.192929713260869e-06, + "loss": 0.6199, + "step": 3927 + }, + { + "epoch": 0.5, + "grad_norm": 1.3159161806106567, + "learning_rate": 5.190856213837656e-06, + "loss": 0.6078, + "step": 3928 + }, + { + "epoch": 0.5, + "grad_norm": 1.0987286567687988, + "learning_rate": 5.1887826815433415e-06, + "loss": 0.5123, + "step": 3929 + }, + { + "epoch": 0.5, + "grad_norm": 1.5091686248779297, + "learning_rate": 5.186709116735046e-06, + "loss": 0.596, + "step": 3930 + }, + { + "epoch": 0.5, + "grad_norm": 1.3786001205444336, + "learning_rate": 5.1846355197699e-06, + "loss": 0.6173, + "step": 3931 + }, + { + "epoch": 0.5, + "grad_norm": 1.4815407991409302, + "learning_rate": 5.182561891005039e-06, + "loss": 0.5635, + "step": 3932 + }, + { + "epoch": 0.5, + "grad_norm": 1.4058189392089844, + "learning_rate": 5.180488230797602e-06, + "loss": 0.5896, + "step": 3933 + }, + { + "epoch": 0.5, + "grad_norm": 1.2490657567977905, + "learning_rate": 5.178414539504737e-06, + "loss": 0.596, + "step": 3934 + }, + { + "epoch": 0.5, + "grad_norm": 1.224664330482483, + "learning_rate": 5.176340817483592e-06, + "loss": 0.6107, + "step": 3935 + }, + { + "epoch": 0.5, + "grad_norm": 1.3340404033660889, + "learning_rate": 5.174267065091329e-06, + "loss": 0.5858, + "step": 3936 + }, + { + "epoch": 0.5, + "grad_norm": 1.558334469795227, + "learning_rate": 5.172193282685104e-06, + "loss": 0.5914, + "step": 3937 + }, + { + "epoch": 0.5, + "grad_norm": 1.1823575496673584, + "learning_rate": 5.170119470622086e-06, + "loss": 0.5338, + "step": 3938 + }, + { + "epoch": 0.5, + "grad_norm": 1.632826328277588, + "learning_rate": 5.168045629259446e-06, + "loss": 0.7068, + "step": 3939 + }, + { + "epoch": 0.5, + "grad_norm": 1.282450556755066, + "learning_rate": 5.1659717589543635e-06, + "loss": 0.6205, + "step": 3940 + }, + { + "epoch": 0.5, + "grad_norm": 2.351825714111328, + "learning_rate": 5.163897860064019e-06, + "loss": 0.6188, + "step": 3941 + }, + { + "epoch": 0.51, + "grad_norm": 1.1424978971481323, + "learning_rate": 5.1618239329456e-06, + "loss": 0.5913, + "step": 3942 + }, + { + "epoch": 0.51, + "grad_norm": 1.2732305526733398, + "learning_rate": 5.159749977956295e-06, + "loss": 0.572, + "step": 3943 + }, + { + "epoch": 0.51, + "grad_norm": 1.20172917842865, + "learning_rate": 5.157675995453306e-06, + "loss": 0.7332, + "step": 3944 + }, + { + "epoch": 0.51, + "grad_norm": 1.1711151599884033, + "learning_rate": 5.155601985793831e-06, + "loss": 0.5454, + "step": 3945 + }, + { + "epoch": 0.51, + "grad_norm": 1.4528971910476685, + "learning_rate": 5.153527949335077e-06, + "loss": 0.5712, + "step": 3946 + }, + { + "epoch": 0.51, + "grad_norm": 1.6643458604812622, + "learning_rate": 5.151453886434255e-06, + "loss": 0.6729, + "step": 3947 + }, + { + "epoch": 0.51, + "grad_norm": 1.3720951080322266, + "learning_rate": 5.149379797448577e-06, + "loss": 0.6249, + "step": 3948 + }, + { + "epoch": 0.51, + "grad_norm": 1.305267572402954, + "learning_rate": 5.147305682735266e-06, + "loss": 0.5595, + "step": 3949 + }, + { + "epoch": 0.51, + "grad_norm": 1.398755669593811, + "learning_rate": 5.145231542651547e-06, + "loss": 0.5929, + "step": 3950 + }, + { + "epoch": 0.51, + "grad_norm": 1.1850718259811401, + "learning_rate": 5.143157377554645e-06, + "loss": 0.6145, + "step": 3951 + }, + { + "epoch": 0.51, + "grad_norm": 1.3608309030532837, + "learning_rate": 5.1410831878017945e-06, + "loss": 0.6691, + "step": 3952 + }, + { + "epoch": 0.51, + "grad_norm": 1.1051334142684937, + "learning_rate": 5.139008973750234e-06, + "loss": 0.6059, + "step": 3953 + }, + { + "epoch": 0.51, + "grad_norm": 1.0188237428665161, + "learning_rate": 5.136934735757202e-06, + "loss": 0.6137, + "step": 3954 + }, + { + "epoch": 0.51, + "grad_norm": 1.2129838466644287, + "learning_rate": 5.1348604741799455e-06, + "loss": 0.5999, + "step": 3955 + }, + { + "epoch": 0.51, + "grad_norm": 2.062683343887329, + "learning_rate": 5.1327861893757125e-06, + "loss": 0.5963, + "step": 3956 + }, + { + "epoch": 0.51, + "grad_norm": 1.2500804662704468, + "learning_rate": 5.1307118817017575e-06, + "loss": 0.6215, + "step": 3957 + }, + { + "epoch": 0.51, + "grad_norm": 1.3206661939620972, + "learning_rate": 5.12863755151534e-06, + "loss": 0.7026, + "step": 3958 + }, + { + "epoch": 0.51, + "grad_norm": 2.201270580291748, + "learning_rate": 5.1265631991737165e-06, + "loss": 0.6585, + "step": 3959 + }, + { + "epoch": 0.51, + "grad_norm": 1.1174263954162598, + "learning_rate": 5.124488825034155e-06, + "loss": 0.5546, + "step": 3960 + }, + { + "epoch": 0.51, + "grad_norm": 1.306179165840149, + "learning_rate": 5.122414429453923e-06, + "loss": 0.6222, + "step": 3961 + }, + { + "epoch": 0.51, + "grad_norm": 1.289539098739624, + "learning_rate": 5.120340012790296e-06, + "loss": 0.6138, + "step": 3962 + }, + { + "epoch": 0.51, + "grad_norm": 1.3443716764450073, + "learning_rate": 5.118265575400546e-06, + "loss": 0.5832, + "step": 3963 + }, + { + "epoch": 0.51, + "grad_norm": 1.2395848035812378, + "learning_rate": 5.116191117641955e-06, + "loss": 0.6501, + "step": 3964 + }, + { + "epoch": 0.51, + "grad_norm": 1.1485309600830078, + "learning_rate": 5.114116639871804e-06, + "loss": 0.579, + "step": 3965 + }, + { + "epoch": 0.51, + "grad_norm": 1.2162903547286987, + "learning_rate": 5.112042142447384e-06, + "loss": 0.5577, + "step": 3966 + }, + { + "epoch": 0.51, + "grad_norm": 1.1788960695266724, + "learning_rate": 5.10996762572598e-06, + "loss": 0.5424, + "step": 3967 + }, + { + "epoch": 0.51, + "grad_norm": 1.1972217559814453, + "learning_rate": 5.10789309006489e-06, + "loss": 0.5713, + "step": 3968 + }, + { + "epoch": 0.51, + "grad_norm": 1.1885600090026855, + "learning_rate": 5.105818535821406e-06, + "loss": 0.5696, + "step": 3969 + }, + { + "epoch": 0.51, + "grad_norm": 1.4530361890792847, + "learning_rate": 5.103743963352832e-06, + "loss": 0.6195, + "step": 3970 + }, + { + "epoch": 0.51, + "grad_norm": 1.3970437049865723, + "learning_rate": 5.101669373016469e-06, + "loss": 0.7098, + "step": 3971 + }, + { + "epoch": 0.51, + "grad_norm": 1.6319563388824463, + "learning_rate": 5.099594765169621e-06, + "loss": 0.5933, + "step": 3972 + }, + { + "epoch": 0.51, + "grad_norm": 1.417358636856079, + "learning_rate": 5.097520140169599e-06, + "loss": 0.6566, + "step": 3973 + }, + { + "epoch": 0.51, + "grad_norm": 1.314659833908081, + "learning_rate": 5.095445498373717e-06, + "loss": 0.6062, + "step": 3974 + }, + { + "epoch": 0.51, + "grad_norm": 1.4447764158248901, + "learning_rate": 5.0933708401392864e-06, + "loss": 0.5829, + "step": 3975 + }, + { + "epoch": 0.51, + "grad_norm": 1.1412814855575562, + "learning_rate": 5.091296165823627e-06, + "loss": 0.5649, + "step": 3976 + }, + { + "epoch": 0.51, + "grad_norm": 3.5301032066345215, + "learning_rate": 5.08922147578406e-06, + "loss": 0.5886, + "step": 3977 + }, + { + "epoch": 0.51, + "grad_norm": 1.2669841051101685, + "learning_rate": 5.0871467703779054e-06, + "loss": 0.5541, + "step": 3978 + }, + { + "epoch": 0.51, + "grad_norm": 1.5283225774765015, + "learning_rate": 5.0850720499624915e-06, + "loss": 0.5673, + "step": 3979 + }, + { + "epoch": 0.51, + "grad_norm": 1.3331176042556763, + "learning_rate": 5.082997314895146e-06, + "loss": 0.6687, + "step": 3980 + }, + { + "epoch": 0.51, + "grad_norm": 1.5164918899536133, + "learning_rate": 5.080922565533201e-06, + "loss": 0.6221, + "step": 3981 + }, + { + "epoch": 0.51, + "grad_norm": 1.4119714498519897, + "learning_rate": 5.078847802233987e-06, + "loss": 0.5823, + "step": 3982 + }, + { + "epoch": 0.51, + "grad_norm": 1.263611912727356, + "learning_rate": 5.076773025354843e-06, + "loss": 0.6335, + "step": 3983 + }, + { + "epoch": 0.51, + "grad_norm": 1.6149969100952148, + "learning_rate": 5.074698235253106e-06, + "loss": 0.6478, + "step": 3984 + }, + { + "epoch": 0.51, + "grad_norm": 1.2421183586120605, + "learning_rate": 5.072623432286116e-06, + "loss": 0.7456, + "step": 3985 + }, + { + "epoch": 0.51, + "grad_norm": 1.2369145154953003, + "learning_rate": 5.070548616811216e-06, + "loss": 0.5444, + "step": 3986 + }, + { + "epoch": 0.51, + "grad_norm": 1.4720882177352905, + "learning_rate": 5.0684737891857505e-06, + "loss": 0.5883, + "step": 3987 + }, + { + "epoch": 0.51, + "grad_norm": 1.7480475902557373, + "learning_rate": 5.066398949767068e-06, + "loss": 0.5908, + "step": 3988 + }, + { + "epoch": 0.51, + "grad_norm": 1.277265191078186, + "learning_rate": 5.064324098912513e-06, + "loss": 0.5804, + "step": 3989 + }, + { + "epoch": 0.51, + "grad_norm": 1.826635479927063, + "learning_rate": 5.062249236979442e-06, + "loss": 0.6328, + "step": 3990 + }, + { + "epoch": 0.51, + "grad_norm": 1.3039029836654663, + "learning_rate": 5.060174364325202e-06, + "loss": 0.6261, + "step": 3991 + }, + { + "epoch": 0.51, + "grad_norm": 1.1858707666397095, + "learning_rate": 5.058099481307154e-06, + "loss": 0.5912, + "step": 3992 + }, + { + "epoch": 0.51, + "grad_norm": 1.1001001596450806, + "learning_rate": 5.05602458828265e-06, + "loss": 0.5876, + "step": 3993 + }, + { + "epoch": 0.51, + "grad_norm": 1.4584338665008545, + "learning_rate": 5.053949685609051e-06, + "loss": 0.5939, + "step": 3994 + }, + { + "epoch": 0.51, + "grad_norm": 1.3201522827148438, + "learning_rate": 5.051874773643713e-06, + "loss": 0.654, + "step": 3995 + }, + { + "epoch": 0.51, + "grad_norm": 1.044450044631958, + "learning_rate": 5.049799852744001e-06, + "loss": 0.58, + "step": 3996 + }, + { + "epoch": 0.51, + "grad_norm": 1.269258975982666, + "learning_rate": 5.047724923267277e-06, + "loss": 0.5994, + "step": 3997 + }, + { + "epoch": 0.51, + "grad_norm": 1.2389514446258545, + "learning_rate": 5.045649985570904e-06, + "loss": 0.5532, + "step": 3998 + }, + { + "epoch": 0.51, + "grad_norm": 1.3213895559310913, + "learning_rate": 5.0435750400122485e-06, + "loss": 0.6093, + "step": 3999 + }, + { + "epoch": 0.51, + "grad_norm": 1.4534727334976196, + "learning_rate": 5.041500086948677e-06, + "loss": 0.6433, + "step": 4000 + }, + { + "epoch": 0.51, + "grad_norm": 1.4285792112350464, + "learning_rate": 5.039425126737563e-06, + "loss": 0.6478, + "step": 4001 + }, + { + "epoch": 0.51, + "grad_norm": 1.21049165725708, + "learning_rate": 5.0373501597362685e-06, + "loss": 0.6199, + "step": 4002 + }, + { + "epoch": 0.51, + "grad_norm": 1.1804089546203613, + "learning_rate": 5.03527518630217e-06, + "loss": 0.5829, + "step": 4003 + }, + { + "epoch": 0.51, + "grad_norm": 1.5666120052337646, + "learning_rate": 5.033200206792637e-06, + "loss": 0.616, + "step": 4004 + }, + { + "epoch": 0.51, + "grad_norm": 1.9499365091323853, + "learning_rate": 5.031125221565044e-06, + "loss": 0.6225, + "step": 4005 + }, + { + "epoch": 0.51, + "grad_norm": 2.4018373489379883, + "learning_rate": 5.029050230976763e-06, + "loss": 0.6091, + "step": 4006 + }, + { + "epoch": 0.51, + "grad_norm": 1.8645708560943604, + "learning_rate": 5.026975235385172e-06, + "loss": 0.6104, + "step": 4007 + }, + { + "epoch": 0.51, + "grad_norm": 1.2363499402999878, + "learning_rate": 5.024900235147643e-06, + "loss": 0.6544, + "step": 4008 + }, + { + "epoch": 0.51, + "grad_norm": 1.315367341041565, + "learning_rate": 5.022825230621555e-06, + "loss": 0.6242, + "step": 4009 + }, + { + "epoch": 0.51, + "grad_norm": 1.2281227111816406, + "learning_rate": 5.020750222164286e-06, + "loss": 0.5227, + "step": 4010 + }, + { + "epoch": 0.51, + "grad_norm": 1.3124263286590576, + "learning_rate": 5.0186752101332124e-06, + "loss": 0.6848, + "step": 4011 + }, + { + "epoch": 0.51, + "grad_norm": 1.246238350868225, + "learning_rate": 5.016600194885714e-06, + "loss": 0.5938, + "step": 4012 + }, + { + "epoch": 0.51, + "grad_norm": 1.2616819143295288, + "learning_rate": 5.014525176779168e-06, + "loss": 0.6353, + "step": 4013 + }, + { + "epoch": 0.51, + "grad_norm": 1.8460921049118042, + "learning_rate": 5.012450156170957e-06, + "loss": 0.5932, + "step": 4014 + }, + { + "epoch": 0.51, + "grad_norm": 1.32743501663208, + "learning_rate": 5.0103751334184595e-06, + "loss": 0.6467, + "step": 4015 + }, + { + "epoch": 0.51, + "grad_norm": 1.421920657157898, + "learning_rate": 5.008300108879055e-06, + "loss": 0.521, + "step": 4016 + }, + { + "epoch": 0.51, + "grad_norm": 1.3539751768112183, + "learning_rate": 5.006225082910126e-06, + "loss": 0.5817, + "step": 4017 + }, + { + "epoch": 0.51, + "grad_norm": 1.1650025844573975, + "learning_rate": 5.004150055869053e-06, + "loss": 0.6017, + "step": 4018 + }, + { + "epoch": 0.51, + "grad_norm": 1.121670126914978, + "learning_rate": 5.0020750281132165e-06, + "loss": 0.546, + "step": 4019 + }, + { + "epoch": 0.52, + "grad_norm": 1.4239189624786377, + "learning_rate": 5e-06, + "loss": 0.5849, + "step": 4020 + }, + { + "epoch": 0.52, + "grad_norm": 1.4270631074905396, + "learning_rate": 4.997924971886784e-06, + "loss": 0.5985, + "step": 4021 + }, + { + "epoch": 0.52, + "grad_norm": 1.1389853954315186, + "learning_rate": 4.995849944130948e-06, + "loss": 0.5686, + "step": 4022 + }, + { + "epoch": 0.52, + "grad_norm": 1.3634734153747559, + "learning_rate": 4.993774917089876e-06, + "loss": 0.6108, + "step": 4023 + }, + { + "epoch": 0.52, + "grad_norm": 1.4653452634811401, + "learning_rate": 4.991699891120947e-06, + "loss": 0.5871, + "step": 4024 + }, + { + "epoch": 0.52, + "grad_norm": 1.2894748449325562, + "learning_rate": 4.989624866581544e-06, + "loss": 0.575, + "step": 4025 + }, + { + "epoch": 0.52, + "grad_norm": 1.3853498697280884, + "learning_rate": 4.987549843829045e-06, + "loss": 0.6293, + "step": 4026 + }, + { + "epoch": 0.52, + "grad_norm": 1.2562291622161865, + "learning_rate": 4.985474823220835e-06, + "loss": 0.6276, + "step": 4027 + }, + { + "epoch": 0.52, + "grad_norm": 1.1109954118728638, + "learning_rate": 4.983399805114289e-06, + "loss": 0.5868, + "step": 4028 + }, + { + "epoch": 0.52, + "grad_norm": 1.2795517444610596, + "learning_rate": 4.981324789866788e-06, + "loss": 0.5745, + "step": 4029 + }, + { + "epoch": 0.52, + "grad_norm": 1.9505984783172607, + "learning_rate": 4.979249777835715e-06, + "loss": 0.5709, + "step": 4030 + }, + { + "epoch": 0.52, + "grad_norm": 1.2079765796661377, + "learning_rate": 4.977174769378445e-06, + "loss": 0.6797, + "step": 4031 + }, + { + "epoch": 0.52, + "grad_norm": 1.466506004333496, + "learning_rate": 4.975099764852359e-06, + "loss": 0.6298, + "step": 4032 + }, + { + "epoch": 0.52, + "grad_norm": 1.308029294013977, + "learning_rate": 4.973024764614829e-06, + "loss": 0.5299, + "step": 4033 + }, + { + "epoch": 0.52, + "grad_norm": 4.230959892272949, + "learning_rate": 4.970949769023238e-06, + "loss": 0.6072, + "step": 4034 + }, + { + "epoch": 0.52, + "grad_norm": 1.2437947988510132, + "learning_rate": 4.968874778434957e-06, + "loss": 0.6981, + "step": 4035 + }, + { + "epoch": 0.52, + "grad_norm": 1.1782584190368652, + "learning_rate": 4.966799793207364e-06, + "loss": 0.5982, + "step": 4036 + }, + { + "epoch": 0.52, + "grad_norm": 1.3160803318023682, + "learning_rate": 4.964724813697831e-06, + "loss": 0.6438, + "step": 4037 + }, + { + "epoch": 0.52, + "grad_norm": 1.471976637840271, + "learning_rate": 4.962649840263733e-06, + "loss": 0.5819, + "step": 4038 + }, + { + "epoch": 0.52, + "grad_norm": 1.1718621253967285, + "learning_rate": 4.960574873262439e-06, + "loss": 0.545, + "step": 4039 + }, + { + "epoch": 0.52, + "grad_norm": 1.1031982898712158, + "learning_rate": 4.9584999130513235e-06, + "loss": 0.551, + "step": 4040 + }, + { + "epoch": 0.52, + "grad_norm": 1.2355008125305176, + "learning_rate": 4.956424959987753e-06, + "loss": 0.538, + "step": 4041 + }, + { + "epoch": 0.52, + "grad_norm": 1.5801515579223633, + "learning_rate": 4.954350014429099e-06, + "loss": 0.622, + "step": 4042 + }, + { + "epoch": 0.52, + "grad_norm": 1.3405274152755737, + "learning_rate": 4.952275076732726e-06, + "loss": 0.6235, + "step": 4043 + }, + { + "epoch": 0.52, + "grad_norm": 1.0359275341033936, + "learning_rate": 4.950200147256002e-06, + "loss": 0.743, + "step": 4044 + }, + { + "epoch": 0.52, + "grad_norm": 1.0925586223602295, + "learning_rate": 4.948125226356288e-06, + "loss": 0.662, + "step": 4045 + }, + { + "epoch": 0.52, + "grad_norm": 1.484156847000122, + "learning_rate": 4.94605031439095e-06, + "loss": 0.6197, + "step": 4046 + }, + { + "epoch": 0.52, + "grad_norm": 1.1978555917739868, + "learning_rate": 4.943975411717351e-06, + "loss": 0.6148, + "step": 4047 + }, + { + "epoch": 0.52, + "grad_norm": 1.3440272808074951, + "learning_rate": 4.941900518692846e-06, + "loss": 0.5443, + "step": 4048 + }, + { + "epoch": 0.52, + "grad_norm": 1.8050079345703125, + "learning_rate": 4.939825635674798e-06, + "loss": 0.6654, + "step": 4049 + }, + { + "epoch": 0.52, + "grad_norm": 1.0395833253860474, + "learning_rate": 4.93775076302056e-06, + "loss": 0.6051, + "step": 4050 + }, + { + "epoch": 0.52, + "grad_norm": 2.008521556854248, + "learning_rate": 4.935675901087488e-06, + "loss": 0.6138, + "step": 4051 + }, + { + "epoch": 0.52, + "grad_norm": 2.4628641605377197, + "learning_rate": 4.933601050232935e-06, + "loss": 0.5635, + "step": 4052 + }, + { + "epoch": 0.52, + "grad_norm": 1.2769087553024292, + "learning_rate": 4.931526210814251e-06, + "loss": 0.6141, + "step": 4053 + }, + { + "epoch": 0.52, + "grad_norm": 1.3242411613464355, + "learning_rate": 4.929451383188785e-06, + "loss": 0.6098, + "step": 4054 + }, + { + "epoch": 0.52, + "grad_norm": 1.0932570695877075, + "learning_rate": 4.927376567713886e-06, + "loss": 0.6458, + "step": 4055 + }, + { + "epoch": 0.52, + "grad_norm": 1.3448100090026855, + "learning_rate": 4.925301764746895e-06, + "loss": 0.5961, + "step": 4056 + }, + { + "epoch": 0.52, + "grad_norm": 1.2128664255142212, + "learning_rate": 4.923226974645158e-06, + "loss": 0.5711, + "step": 4057 + }, + { + "epoch": 0.52, + "grad_norm": 1.6496801376342773, + "learning_rate": 4.921152197766014e-06, + "loss": 0.5831, + "step": 4058 + }, + { + "epoch": 0.52, + "grad_norm": 1.2741731405258179, + "learning_rate": 4.919077434466802e-06, + "loss": 0.6547, + "step": 4059 + }, + { + "epoch": 0.52, + "grad_norm": 1.3321490287780762, + "learning_rate": 4.917002685104855e-06, + "loss": 0.5447, + "step": 4060 + }, + { + "epoch": 0.52, + "grad_norm": 1.141733169555664, + "learning_rate": 4.914927950037511e-06, + "loss": 0.5685, + "step": 4061 + }, + { + "epoch": 0.52, + "grad_norm": 1.9314861297607422, + "learning_rate": 4.912853229622096e-06, + "loss": 0.5964, + "step": 4062 + }, + { + "epoch": 0.52, + "grad_norm": 1.4189141988754272, + "learning_rate": 4.910778524215941e-06, + "loss": 0.586, + "step": 4063 + }, + { + "epoch": 0.52, + "grad_norm": 1.2865058183670044, + "learning_rate": 4.908703834176373e-06, + "loss": 0.5715, + "step": 4064 + }, + { + "epoch": 0.52, + "grad_norm": 1.4331640005111694, + "learning_rate": 4.906629159860713e-06, + "loss": 0.6381, + "step": 4065 + }, + { + "epoch": 0.52, + "grad_norm": 1.0844879150390625, + "learning_rate": 4.904554501626284e-06, + "loss": 0.5739, + "step": 4066 + }, + { + "epoch": 0.52, + "grad_norm": 1.2306838035583496, + "learning_rate": 4.9024798598304006e-06, + "loss": 0.7441, + "step": 4067 + }, + { + "epoch": 0.52, + "grad_norm": 1.091158390045166, + "learning_rate": 4.90040523483038e-06, + "loss": 0.6085, + "step": 4068 + }, + { + "epoch": 0.52, + "grad_norm": 1.3969615697860718, + "learning_rate": 4.898330626983533e-06, + "loss": 0.6604, + "step": 4069 + }, + { + "epoch": 0.52, + "grad_norm": 1.0094318389892578, + "learning_rate": 4.89625603664717e-06, + "loss": 0.5522, + "step": 4070 + }, + { + "epoch": 0.52, + "grad_norm": 1.2802209854125977, + "learning_rate": 4.894181464178595e-06, + "loss": 0.6044, + "step": 4071 + }, + { + "epoch": 0.52, + "grad_norm": 1.6502206325531006, + "learning_rate": 4.892106909935111e-06, + "loss": 0.6496, + "step": 4072 + }, + { + "epoch": 0.52, + "grad_norm": 1.3228180408477783, + "learning_rate": 4.890032374274021e-06, + "loss": 0.6288, + "step": 4073 + }, + { + "epoch": 0.52, + "grad_norm": 1.3824211359024048, + "learning_rate": 4.887957857552617e-06, + "loss": 0.6468, + "step": 4074 + }, + { + "epoch": 0.52, + "grad_norm": 0.9041395783424377, + "learning_rate": 4.885883360128197e-06, + "loss": 0.5782, + "step": 4075 + }, + { + "epoch": 0.52, + "grad_norm": 1.6577094793319702, + "learning_rate": 4.883808882358047e-06, + "loss": 0.6067, + "step": 4076 + }, + { + "epoch": 0.52, + "grad_norm": 1.2962143421173096, + "learning_rate": 4.881734424599456e-06, + "loss": 0.6659, + "step": 4077 + }, + { + "epoch": 0.52, + "grad_norm": 1.3377918004989624, + "learning_rate": 4.879659987209707e-06, + "loss": 0.6366, + "step": 4078 + }, + { + "epoch": 0.52, + "grad_norm": 2.1966018676757812, + "learning_rate": 4.877585570546078e-06, + "loss": 0.6419, + "step": 4079 + }, + { + "epoch": 0.52, + "grad_norm": 1.3197449445724487, + "learning_rate": 4.875511174965846e-06, + "loss": 0.6256, + "step": 4080 + }, + { + "epoch": 0.52, + "grad_norm": 1.4747912883758545, + "learning_rate": 4.8734368008262835e-06, + "loss": 0.6736, + "step": 4081 + }, + { + "epoch": 0.52, + "grad_norm": 1.2974356412887573, + "learning_rate": 4.871362448484662e-06, + "loss": 0.5823, + "step": 4082 + }, + { + "epoch": 0.52, + "grad_norm": 1.4338997602462769, + "learning_rate": 4.869288118298242e-06, + "loss": 0.5833, + "step": 4083 + }, + { + "epoch": 0.52, + "grad_norm": 1.3909071683883667, + "learning_rate": 4.867213810624288e-06, + "loss": 0.5486, + "step": 4084 + }, + { + "epoch": 0.52, + "grad_norm": 1.427019476890564, + "learning_rate": 4.865139525820055e-06, + "loss": 0.555, + "step": 4085 + }, + { + "epoch": 0.52, + "grad_norm": 1.493001937866211, + "learning_rate": 4.8630652642428e-06, + "loss": 0.6544, + "step": 4086 + }, + { + "epoch": 0.52, + "grad_norm": 1.26246178150177, + "learning_rate": 4.860991026249768e-06, + "loss": 0.6591, + "step": 4087 + }, + { + "epoch": 0.52, + "grad_norm": 1.4320074319839478, + "learning_rate": 4.858916812198206e-06, + "loss": 0.6442, + "step": 4088 + }, + { + "epoch": 0.52, + "grad_norm": 1.3701486587524414, + "learning_rate": 4.856842622445356e-06, + "loss": 0.5566, + "step": 4089 + }, + { + "epoch": 0.52, + "grad_norm": 1.2770544290542603, + "learning_rate": 4.854768457348456e-06, + "loss": 0.6557, + "step": 4090 + }, + { + "epoch": 0.52, + "grad_norm": 1.5645573139190674, + "learning_rate": 4.852694317264735e-06, + "loss": 0.6767, + "step": 4091 + }, + { + "epoch": 0.52, + "grad_norm": 1.349003791809082, + "learning_rate": 4.850620202551425e-06, + "loss": 0.6173, + "step": 4092 + }, + { + "epoch": 0.52, + "grad_norm": 1.3395682573318481, + "learning_rate": 4.848546113565748e-06, + "loss": 0.5781, + "step": 4093 + }, + { + "epoch": 0.52, + "grad_norm": 1.317745566368103, + "learning_rate": 4.846472050664925e-06, + "loss": 0.6592, + "step": 4094 + }, + { + "epoch": 0.52, + "grad_norm": 4.009605884552002, + "learning_rate": 4.84439801420617e-06, + "loss": 0.5211, + "step": 4095 + }, + { + "epoch": 0.52, + "grad_norm": 1.3194994926452637, + "learning_rate": 4.842324004546696e-06, + "loss": 0.6196, + "step": 4096 + }, + { + "epoch": 0.52, + "grad_norm": 1.368104338645935, + "learning_rate": 4.8402500220437054e-06, + "loss": 0.6548, + "step": 4097 + }, + { + "epoch": 0.53, + "grad_norm": 1.1761075258255005, + "learning_rate": 4.838176067054401e-06, + "loss": 0.6217, + "step": 4098 + }, + { + "epoch": 0.53, + "grad_norm": 1.5087071657180786, + "learning_rate": 4.836102139935982e-06, + "loss": 0.5141, + "step": 4099 + }, + { + "epoch": 0.53, + "grad_norm": 1.0981336832046509, + "learning_rate": 4.8340282410456365e-06, + "loss": 0.5627, + "step": 4100 + }, + { + "epoch": 0.53, + "grad_norm": 1.1873528957366943, + "learning_rate": 4.831954370740554e-06, + "loss": 0.6115, + "step": 4101 + }, + { + "epoch": 0.53, + "grad_norm": 1.1071884632110596, + "learning_rate": 4.829880529377915e-06, + "loss": 0.5372, + "step": 4102 + }, + { + "epoch": 0.53, + "grad_norm": 1.2068363428115845, + "learning_rate": 4.8278067173148975e-06, + "loss": 0.6624, + "step": 4103 + }, + { + "epoch": 0.53, + "grad_norm": 1.3392771482467651, + "learning_rate": 4.825732934908672e-06, + "loss": 0.6454, + "step": 4104 + }, + { + "epoch": 0.53, + "grad_norm": 3.0413925647735596, + "learning_rate": 4.8236591825164085e-06, + "loss": 0.6027, + "step": 4105 + }, + { + "epoch": 0.53, + "grad_norm": 1.4464915990829468, + "learning_rate": 4.821585460495264e-06, + "loss": 0.6546, + "step": 4106 + }, + { + "epoch": 0.53, + "grad_norm": 1.405840277671814, + "learning_rate": 4.819511769202399e-06, + "loss": 0.5513, + "step": 4107 + }, + { + "epoch": 0.53, + "grad_norm": 1.2467281818389893, + "learning_rate": 4.817438108994963e-06, + "loss": 0.5584, + "step": 4108 + }, + { + "epoch": 0.53, + "grad_norm": 1.293278455734253, + "learning_rate": 4.815364480230103e-06, + "loss": 0.6054, + "step": 4109 + }, + { + "epoch": 0.53, + "grad_norm": 1.4756916761398315, + "learning_rate": 4.813290883264956e-06, + "loss": 0.6539, + "step": 4110 + }, + { + "epoch": 0.53, + "grad_norm": 1.106099009513855, + "learning_rate": 4.811217318456661e-06, + "loss": 0.7242, + "step": 4111 + }, + { + "epoch": 0.53, + "grad_norm": 1.1641430854797363, + "learning_rate": 4.809143786162345e-06, + "loss": 0.605, + "step": 4112 + }, + { + "epoch": 0.53, + "grad_norm": 1.113264799118042, + "learning_rate": 4.807070286739134e-06, + "loss": 0.753, + "step": 4113 + }, + { + "epoch": 0.53, + "grad_norm": 1.1319680213928223, + "learning_rate": 4.804996820544144e-06, + "loss": 0.7126, + "step": 4114 + }, + { + "epoch": 0.53, + "grad_norm": 1.2857635021209717, + "learning_rate": 4.8029233879344845e-06, + "loss": 0.6106, + "step": 4115 + }, + { + "epoch": 0.53, + "grad_norm": 1.3838752508163452, + "learning_rate": 4.800849989267269e-06, + "loss": 0.5595, + "step": 4116 + }, + { + "epoch": 0.53, + "grad_norm": 6.1764984130859375, + "learning_rate": 4.798776624899595e-06, + "loss": 0.5441, + "step": 4117 + }, + { + "epoch": 0.53, + "grad_norm": 1.6189913749694824, + "learning_rate": 4.796703295188557e-06, + "loss": 0.6184, + "step": 4118 + }, + { + "epoch": 0.53, + "grad_norm": 2.234065532684326, + "learning_rate": 4.7946300004912454e-06, + "loss": 0.671, + "step": 4119 + }, + { + "epoch": 0.53, + "grad_norm": 1.2225993871688843, + "learning_rate": 4.7925567411647405e-06, + "loss": 0.6519, + "step": 4120 + }, + { + "epoch": 0.53, + "grad_norm": 1.2532271146774292, + "learning_rate": 4.790483517566122e-06, + "loss": 0.5927, + "step": 4121 + }, + { + "epoch": 0.53, + "grad_norm": 1.2144299745559692, + "learning_rate": 4.788410330052457e-06, + "loss": 0.724, + "step": 4122 + }, + { + "epoch": 0.53, + "grad_norm": 1.2604823112487793, + "learning_rate": 4.7863371789808135e-06, + "loss": 0.5651, + "step": 4123 + }, + { + "epoch": 0.53, + "grad_norm": 1.168423056602478, + "learning_rate": 4.784264064708247e-06, + "loss": 0.5987, + "step": 4124 + }, + { + "epoch": 0.53, + "grad_norm": 1.171149730682373, + "learning_rate": 4.782190987591811e-06, + "loss": 0.6428, + "step": 4125 + }, + { + "epoch": 0.53, + "grad_norm": 1.292298674583435, + "learning_rate": 4.7801179479885495e-06, + "loss": 0.6348, + "step": 4126 + }, + { + "epoch": 0.53, + "grad_norm": 1.250784993171692, + "learning_rate": 4.778044946255503e-06, + "loss": 0.5781, + "step": 4127 + }, + { + "epoch": 0.53, + "grad_norm": 1.4512914419174194, + "learning_rate": 4.775971982749703e-06, + "loss": 0.5359, + "step": 4128 + }, + { + "epoch": 0.53, + "grad_norm": 1.359748363494873, + "learning_rate": 4.773899057828176e-06, + "loss": 0.5784, + "step": 4129 + }, + { + "epoch": 0.53, + "grad_norm": 1.3749535083770752, + "learning_rate": 4.771826171847939e-06, + "loss": 0.6338, + "step": 4130 + }, + { + "epoch": 0.53, + "grad_norm": 1.980396032333374, + "learning_rate": 4.769753325166008e-06, + "loss": 0.6662, + "step": 4131 + }, + { + "epoch": 0.53, + "grad_norm": 1.5456019639968872, + "learning_rate": 4.7676805181393835e-06, + "loss": 0.6045, + "step": 4132 + }, + { + "epoch": 0.53, + "grad_norm": 3.6582868099212646, + "learning_rate": 4.76560775112507e-06, + "loss": 0.5481, + "step": 4133 + }, + { + "epoch": 0.53, + "grad_norm": 1.1082854270935059, + "learning_rate": 4.763535024480057e-06, + "loss": 0.6318, + "step": 4134 + }, + { + "epoch": 0.53, + "grad_norm": 2.2785942554473877, + "learning_rate": 4.761462338561329e-06, + "loss": 0.611, + "step": 4135 + }, + { + "epoch": 0.53, + "grad_norm": 1.0957773923873901, + "learning_rate": 4.759389693725867e-06, + "loss": 0.706, + "step": 4136 + }, + { + "epoch": 0.53, + "grad_norm": 0.9537548422813416, + "learning_rate": 4.757317090330638e-06, + "loss": 0.5881, + "step": 4137 + }, + { + "epoch": 0.53, + "grad_norm": 1.0327272415161133, + "learning_rate": 4.755244528732608e-06, + "loss": 0.5027, + "step": 4138 + }, + { + "epoch": 0.53, + "grad_norm": 1.2874075174331665, + "learning_rate": 4.753172009288732e-06, + "loss": 0.6377, + "step": 4139 + }, + { + "epoch": 0.53, + "grad_norm": 1.2741062641143799, + "learning_rate": 4.751099532355962e-06, + "loss": 0.6127, + "step": 4140 + }, + { + "epoch": 0.53, + "grad_norm": 1.2608203887939453, + "learning_rate": 4.749027098291237e-06, + "loss": 0.583, + "step": 4141 + }, + { + "epoch": 0.53, + "grad_norm": 1.517383337020874, + "learning_rate": 4.7469547074514946e-06, + "loss": 0.5961, + "step": 4142 + }, + { + "epoch": 0.53, + "grad_norm": 1.231284499168396, + "learning_rate": 4.7448823601936585e-06, + "loss": 0.583, + "step": 4143 + }, + { + "epoch": 0.53, + "grad_norm": 1.1564208269119263, + "learning_rate": 4.742810056874652e-06, + "loss": 0.6192, + "step": 4144 + }, + { + "epoch": 0.53, + "grad_norm": 1.2633081674575806, + "learning_rate": 4.740737797851385e-06, + "loss": 0.5658, + "step": 4145 + }, + { + "epoch": 0.53, + "grad_norm": 1.1155922412872314, + "learning_rate": 4.7386655834807634e-06, + "loss": 0.5894, + "step": 4146 + }, + { + "epoch": 0.53, + "grad_norm": 1.3662333488464355, + "learning_rate": 4.736593414119682e-06, + "loss": 0.6027, + "step": 4147 + }, + { + "epoch": 0.53, + "grad_norm": 1.2502559423446655, + "learning_rate": 4.734521290125032e-06, + "loss": 0.5672, + "step": 4148 + }, + { + "epoch": 0.53, + "grad_norm": 1.3026888370513916, + "learning_rate": 4.732449211853693e-06, + "loss": 0.5624, + "step": 4149 + }, + { + "epoch": 0.53, + "grad_norm": 1.1376659870147705, + "learning_rate": 4.730377179662538e-06, + "loss": 0.5888, + "step": 4150 + }, + { + "epoch": 0.53, + "grad_norm": 1.2101504802703857, + "learning_rate": 4.728305193908436e-06, + "loss": 0.6549, + "step": 4151 + }, + { + "epoch": 0.53, + "grad_norm": 1.048017144203186, + "learning_rate": 4.72623325494824e-06, + "loss": 0.6031, + "step": 4152 + }, + { + "epoch": 0.53, + "grad_norm": 1.2536982297897339, + "learning_rate": 4.7241613631388034e-06, + "loss": 0.5735, + "step": 4153 + }, + { + "epoch": 0.53, + "grad_norm": 1.2073613405227661, + "learning_rate": 4.722089518836964e-06, + "loss": 0.6656, + "step": 4154 + }, + { + "epoch": 0.53, + "grad_norm": 1.3333324193954468, + "learning_rate": 4.720017722399557e-06, + "loss": 0.6033, + "step": 4155 + }, + { + "epoch": 0.53, + "grad_norm": 1.8105237483978271, + "learning_rate": 4.717945974183405e-06, + "loss": 0.4994, + "step": 4156 + }, + { + "epoch": 0.53, + "grad_norm": 1.501945972442627, + "learning_rate": 4.715874274545328e-06, + "loss": 0.5396, + "step": 4157 + }, + { + "epoch": 0.53, + "grad_norm": 1.8984172344207764, + "learning_rate": 4.71380262384213e-06, + "loss": 0.549, + "step": 4158 + }, + { + "epoch": 0.53, + "grad_norm": 1.550038456916809, + "learning_rate": 4.711731022430615e-06, + "loss": 0.5852, + "step": 4159 + }, + { + "epoch": 0.53, + "grad_norm": 1.7350627183914185, + "learning_rate": 4.70965947066757e-06, + "loss": 0.6284, + "step": 4160 + }, + { + "epoch": 0.53, + "grad_norm": 1.5158087015151978, + "learning_rate": 4.707587968909782e-06, + "loss": 0.5684, + "step": 4161 + }, + { + "epoch": 0.53, + "grad_norm": 1.1972614526748657, + "learning_rate": 4.705516517514021e-06, + "loss": 0.5638, + "step": 4162 + }, + { + "epoch": 0.53, + "grad_norm": 1.1058855056762695, + "learning_rate": 4.703445116837055e-06, + "loss": 0.5002, + "step": 4163 + }, + { + "epoch": 0.53, + "grad_norm": 1.3835372924804688, + "learning_rate": 4.701373767235641e-06, + "loss": 0.6697, + "step": 4164 + }, + { + "epoch": 0.53, + "grad_norm": 1.174512267112732, + "learning_rate": 4.699302469066524e-06, + "loss": 0.6428, + "step": 4165 + }, + { + "epoch": 0.53, + "grad_norm": 1.310792326927185, + "learning_rate": 4.6972312226864445e-06, + "loss": 0.652, + "step": 4166 + }, + { + "epoch": 0.53, + "grad_norm": 1.3059464693069458, + "learning_rate": 4.6951600284521324e-06, + "loss": 0.5807, + "step": 4167 + }, + { + "epoch": 0.53, + "grad_norm": 1.334140658378601, + "learning_rate": 4.6930888867203115e-06, + "loss": 0.6129, + "step": 4168 + }, + { + "epoch": 0.53, + "grad_norm": 1.4384865760803223, + "learning_rate": 4.691017797847692e-06, + "loss": 0.6122, + "step": 4169 + }, + { + "epoch": 0.53, + "grad_norm": 1.3629262447357178, + "learning_rate": 4.688946762190975e-06, + "loss": 0.6201, + "step": 4170 + }, + { + "epoch": 0.53, + "grad_norm": 1.473082423210144, + "learning_rate": 4.686875780106856e-06, + "loss": 0.5797, + "step": 4171 + }, + { + "epoch": 0.53, + "grad_norm": 1.335864782333374, + "learning_rate": 4.68480485195202e-06, + "loss": 0.6164, + "step": 4172 + }, + { + "epoch": 0.53, + "grad_norm": 2.127086639404297, + "learning_rate": 4.682733978083142e-06, + "loss": 0.6167, + "step": 4173 + }, + { + "epoch": 0.53, + "grad_norm": 1.2139432430267334, + "learning_rate": 4.680663158856886e-06, + "loss": 0.6562, + "step": 4174 + }, + { + "epoch": 0.53, + "grad_norm": 1.4341139793395996, + "learning_rate": 4.678592394629912e-06, + "loss": 0.6285, + "step": 4175 + }, + { + "epoch": 0.54, + "grad_norm": 1.3252298831939697, + "learning_rate": 4.676521685758863e-06, + "loss": 0.6012, + "step": 4176 + }, + { + "epoch": 0.54, + "grad_norm": 1.3366296291351318, + "learning_rate": 4.6744510326003805e-06, + "loss": 0.6053, + "step": 4177 + }, + { + "epoch": 0.54, + "grad_norm": 1.302324652671814, + "learning_rate": 4.672380435511089e-06, + "loss": 0.6217, + "step": 4178 + }, + { + "epoch": 0.54, + "grad_norm": 1.673362374305725, + "learning_rate": 4.67030989484761e-06, + "loss": 0.6395, + "step": 4179 + }, + { + "epoch": 0.54, + "grad_norm": 1.6342353820800781, + "learning_rate": 4.668239410966549e-06, + "loss": 0.5935, + "step": 4180 + }, + { + "epoch": 0.54, + "grad_norm": 1.6448659896850586, + "learning_rate": 4.666168984224508e-06, + "loss": 0.555, + "step": 4181 + }, + { + "epoch": 0.54, + "grad_norm": 1.3737086057662964, + "learning_rate": 4.664098614978073e-06, + "loss": 0.5609, + "step": 4182 + }, + { + "epoch": 0.54, + "grad_norm": 1.202628254890442, + "learning_rate": 4.662028303583823e-06, + "loss": 0.6397, + "step": 4183 + }, + { + "epoch": 0.54, + "grad_norm": 1.1492021083831787, + "learning_rate": 4.6599580503983295e-06, + "loss": 0.6714, + "step": 4184 + }, + { + "epoch": 0.54, + "grad_norm": 1.1646534204483032, + "learning_rate": 4.657887855778149e-06, + "loss": 0.6116, + "step": 4185 + }, + { + "epoch": 0.54, + "grad_norm": 1.2870683670043945, + "learning_rate": 4.655817720079834e-06, + "loss": 0.5639, + "step": 4186 + }, + { + "epoch": 0.54, + "grad_norm": 2.9178974628448486, + "learning_rate": 4.6537476436599184e-06, + "loss": 0.6177, + "step": 4187 + }, + { + "epoch": 0.54, + "grad_norm": 1.0415689945220947, + "learning_rate": 4.651677626874936e-06, + "loss": 0.5541, + "step": 4188 + }, + { + "epoch": 0.54, + "grad_norm": 1.0987578630447388, + "learning_rate": 4.6496076700814e-06, + "loss": 0.6257, + "step": 4189 + }, + { + "epoch": 0.54, + "grad_norm": 1.4872864484786987, + "learning_rate": 4.647537773635823e-06, + "loss": 0.6029, + "step": 4190 + }, + { + "epoch": 0.54, + "grad_norm": 1.2052521705627441, + "learning_rate": 4.645467937894699e-06, + "loss": 0.6159, + "step": 4191 + }, + { + "epoch": 0.54, + "grad_norm": 1.0160893201828003, + "learning_rate": 4.643398163214517e-06, + "loss": 0.6487, + "step": 4192 + }, + { + "epoch": 0.54, + "grad_norm": 1.498758316040039, + "learning_rate": 4.641328449951753e-06, + "loss": 0.5912, + "step": 4193 + }, + { + "epoch": 0.54, + "grad_norm": 1.1858739852905273, + "learning_rate": 4.6392587984628735e-06, + "loss": 0.7092, + "step": 4194 + }, + { + "epoch": 0.54, + "grad_norm": 1.1768958568572998, + "learning_rate": 4.637189209104333e-06, + "loss": 0.5921, + "step": 4195 + }, + { + "epoch": 0.54, + "grad_norm": 1.3898215293884277, + "learning_rate": 4.635119682232577e-06, + "loss": 0.6398, + "step": 4196 + }, + { + "epoch": 0.54, + "grad_norm": 1.3137660026550293, + "learning_rate": 4.63305021820404e-06, + "loss": 0.6571, + "step": 4197 + }, + { + "epoch": 0.54, + "grad_norm": 1.1577122211456299, + "learning_rate": 4.6309808173751445e-06, + "loss": 0.7469, + "step": 4198 + }, + { + "epoch": 0.54, + "grad_norm": 1.4007368087768555, + "learning_rate": 4.628911480102301e-06, + "loss": 0.6021, + "step": 4199 + }, + { + "epoch": 0.54, + "grad_norm": 1.1606431007385254, + "learning_rate": 4.626842206741912e-06, + "loss": 0.6898, + "step": 4200 + }, + { + "epoch": 0.54, + "grad_norm": 1.4867026805877686, + "learning_rate": 4.62477299765037e-06, + "loss": 0.6388, + "step": 4201 + }, + { + "epoch": 0.54, + "grad_norm": 1.663015365600586, + "learning_rate": 4.622703853184052e-06, + "loss": 0.5711, + "step": 4202 + }, + { + "epoch": 0.54, + "grad_norm": 1.1145719289779663, + "learning_rate": 4.620634773699327e-06, + "loss": 0.603, + "step": 4203 + }, + { + "epoch": 0.54, + "grad_norm": 1.4312100410461426, + "learning_rate": 4.61856575955255e-06, + "loss": 0.6286, + "step": 4204 + }, + { + "epoch": 0.54, + "grad_norm": 1.1849942207336426, + "learning_rate": 4.6164968111000695e-06, + "loss": 0.6667, + "step": 4205 + }, + { + "epoch": 0.54, + "grad_norm": 1.0929937362670898, + "learning_rate": 4.614427928698217e-06, + "loss": 0.6663, + "step": 4206 + }, + { + "epoch": 0.54, + "grad_norm": 0.972935140132904, + "learning_rate": 4.612359112703318e-06, + "loss": 0.5812, + "step": 4207 + }, + { + "epoch": 0.54, + "grad_norm": 1.4837443828582764, + "learning_rate": 4.610290363471681e-06, + "loss": 0.6155, + "step": 4208 + }, + { + "epoch": 0.54, + "grad_norm": 1.6648123264312744, + "learning_rate": 4.608221681359609e-06, + "loss": 0.5247, + "step": 4209 + }, + { + "epoch": 0.54, + "grad_norm": 1.3424453735351562, + "learning_rate": 4.606153066723389e-06, + "loss": 0.6489, + "step": 4210 + }, + { + "epoch": 0.54, + "grad_norm": 1.4238646030426025, + "learning_rate": 4.604084519919298e-06, + "loss": 0.5952, + "step": 4211 + }, + { + "epoch": 0.54, + "grad_norm": 1.4087281227111816, + "learning_rate": 4.602016041303601e-06, + "loss": 0.6647, + "step": 4212 + }, + { + "epoch": 0.54, + "grad_norm": 1.5249015092849731, + "learning_rate": 4.599947631232552e-06, + "loss": 0.6362, + "step": 4213 + }, + { + "epoch": 0.54, + "grad_norm": 1.3257591724395752, + "learning_rate": 4.597879290062393e-06, + "loss": 0.5909, + "step": 4214 + }, + { + "epoch": 0.54, + "grad_norm": 1.1432000398635864, + "learning_rate": 4.595811018149351e-06, + "loss": 0.5854, + "step": 4215 + }, + { + "epoch": 0.54, + "grad_norm": 1.2500845193862915, + "learning_rate": 4.5937428158496475e-06, + "loss": 0.5857, + "step": 4216 + }, + { + "epoch": 0.54, + "grad_norm": 1.0443065166473389, + "learning_rate": 4.591674683519483e-06, + "loss": 0.6195, + "step": 4217 + }, + { + "epoch": 0.54, + "grad_norm": 1.2430527210235596, + "learning_rate": 4.589606621515057e-06, + "loss": 0.5721, + "step": 4218 + }, + { + "epoch": 0.54, + "grad_norm": 1.3105758428573608, + "learning_rate": 4.5875386301925495e-06, + "loss": 0.4901, + "step": 4219 + }, + { + "epoch": 0.54, + "grad_norm": 1.6490211486816406, + "learning_rate": 4.5854707099081285e-06, + "loss": 0.6874, + "step": 4220 + }, + { + "epoch": 0.54, + "grad_norm": 1.3691102266311646, + "learning_rate": 4.583402861017953e-06, + "loss": 0.594, + "step": 4221 + }, + { + "epoch": 0.54, + "grad_norm": 1.2332993745803833, + "learning_rate": 4.5813350838781665e-06, + "loss": 0.6166, + "step": 4222 + }, + { + "epoch": 0.54, + "grad_norm": 1.3578057289123535, + "learning_rate": 4.579267378844902e-06, + "loss": 0.6297, + "step": 4223 + }, + { + "epoch": 0.54, + "grad_norm": 1.7097007036209106, + "learning_rate": 4.577199746274279e-06, + "loss": 0.6119, + "step": 4224 + }, + { + "epoch": 0.54, + "grad_norm": 1.160226583480835, + "learning_rate": 4.575132186522408e-06, + "loss": 0.5266, + "step": 4225 + }, + { + "epoch": 0.54, + "grad_norm": 1.177698016166687, + "learning_rate": 4.5730646999453805e-06, + "loss": 0.6148, + "step": 4226 + }, + { + "epoch": 0.54, + "grad_norm": 1.4060474634170532, + "learning_rate": 4.570997286899282e-06, + "loss": 0.5874, + "step": 4227 + }, + { + "epoch": 0.54, + "grad_norm": 1.2073032855987549, + "learning_rate": 4.56892994774018e-06, + "loss": 0.5137, + "step": 4228 + }, + { + "epoch": 0.54, + "grad_norm": 1.2490034103393555, + "learning_rate": 4.566862682824133e-06, + "loss": 0.58, + "step": 4229 + }, + { + "epoch": 0.54, + "grad_norm": 1.3035485744476318, + "learning_rate": 4.564795492507184e-06, + "loss": 0.6719, + "step": 4230 + }, + { + "epoch": 0.54, + "grad_norm": 1.219054102897644, + "learning_rate": 4.562728377145367e-06, + "loss": 0.6331, + "step": 4231 + }, + { + "epoch": 0.54, + "grad_norm": 1.202539324760437, + "learning_rate": 4.560661337094698e-06, + "loss": 0.5764, + "step": 4232 + }, + { + "epoch": 0.54, + "grad_norm": 1.1667770147323608, + "learning_rate": 4.558594372711185e-06, + "loss": 0.5885, + "step": 4233 + }, + { + "epoch": 0.54, + "grad_norm": 1.0304358005523682, + "learning_rate": 4.556527484350819e-06, + "loss": 0.5146, + "step": 4234 + }, + { + "epoch": 0.54, + "grad_norm": 1.0976370573043823, + "learning_rate": 4.554460672369578e-06, + "loss": 0.574, + "step": 4235 + }, + { + "epoch": 0.54, + "grad_norm": 1.336747646331787, + "learning_rate": 4.552393937123432e-06, + "loss": 0.5512, + "step": 4236 + }, + { + "epoch": 0.54, + "grad_norm": 1.2262814044952393, + "learning_rate": 4.550327278968333e-06, + "loss": 0.5616, + "step": 4237 + }, + { + "epoch": 0.54, + "grad_norm": 1.2212550640106201, + "learning_rate": 4.548260698260219e-06, + "loss": 0.6184, + "step": 4238 + }, + { + "epoch": 0.54, + "grad_norm": 1.4734658002853394, + "learning_rate": 4.546194195355018e-06, + "loss": 0.5742, + "step": 4239 + }, + { + "epoch": 0.54, + "grad_norm": 1.4658452272415161, + "learning_rate": 4.544127770608644e-06, + "loss": 0.6211, + "step": 4240 + }, + { + "epoch": 0.54, + "grad_norm": 1.2137492895126343, + "learning_rate": 4.542061424376995e-06, + "loss": 0.5261, + "step": 4241 + }, + { + "epoch": 0.54, + "grad_norm": 1.2533836364746094, + "learning_rate": 4.539995157015957e-06, + "loss": 0.6744, + "step": 4242 + }, + { + "epoch": 0.54, + "grad_norm": 1.436880111694336, + "learning_rate": 4.537928968881404e-06, + "loss": 0.6731, + "step": 4243 + }, + { + "epoch": 0.54, + "grad_norm": 1.1234179735183716, + "learning_rate": 4.535862860329195e-06, + "loss": 0.5358, + "step": 4244 + }, + { + "epoch": 0.54, + "grad_norm": 1.4436432123184204, + "learning_rate": 4.533796831715172e-06, + "loss": 0.6199, + "step": 4245 + }, + { + "epoch": 0.54, + "grad_norm": 1.6011847257614136, + "learning_rate": 4.531730883395171e-06, + "loss": 0.5899, + "step": 4246 + }, + { + "epoch": 0.54, + "grad_norm": 1.1307777166366577, + "learning_rate": 4.529665015725006e-06, + "loss": 0.6098, + "step": 4247 + }, + { + "epoch": 0.54, + "grad_norm": 1.4225749969482422, + "learning_rate": 4.527599229060483e-06, + "loss": 0.6267, + "step": 4248 + }, + { + "epoch": 0.54, + "grad_norm": 1.1333550214767456, + "learning_rate": 4.5255335237573905e-06, + "loss": 0.6978, + "step": 4249 + }, + { + "epoch": 0.54, + "grad_norm": 1.0923420190811157, + "learning_rate": 4.5234679001715055e-06, + "loss": 0.6762, + "step": 4250 + }, + { + "epoch": 0.54, + "grad_norm": 1.4398398399353027, + "learning_rate": 4.521402358658587e-06, + "loss": 0.684, + "step": 4251 + }, + { + "epoch": 0.54, + "grad_norm": 1.3944870233535767, + "learning_rate": 4.519336899574384e-06, + "loss": 0.6649, + "step": 4252 + }, + { + "epoch": 0.54, + "grad_norm": 1.357353925704956, + "learning_rate": 4.517271523274632e-06, + "loss": 0.5606, + "step": 4253 + }, + { + "epoch": 0.55, + "grad_norm": 1.7038229703903198, + "learning_rate": 4.515206230115047e-06, + "loss": 0.6472, + "step": 4254 + }, + { + "epoch": 0.55, + "grad_norm": 1.4553661346435547, + "learning_rate": 4.5131410204513375e-06, + "loss": 0.5576, + "step": 4255 + }, + { + "epoch": 0.55, + "grad_norm": 1.0828001499176025, + "learning_rate": 4.511075894639189e-06, + "loss": 0.6354, + "step": 4256 + }, + { + "epoch": 0.55, + "grad_norm": 1.3921860456466675, + "learning_rate": 4.509010853034281e-06, + "loss": 0.5799, + "step": 4257 + }, + { + "epoch": 0.55, + "grad_norm": 1.2910650968551636, + "learning_rate": 4.506945895992274e-06, + "loss": 0.6111, + "step": 4258 + }, + { + "epoch": 0.55, + "grad_norm": 1.2620890140533447, + "learning_rate": 4.5048810238688145e-06, + "loss": 0.7103, + "step": 4259 + }, + { + "epoch": 0.55, + "grad_norm": 1.2186883687973022, + "learning_rate": 4.502816237019534e-06, + "loss": 0.6864, + "step": 4260 + }, + { + "epoch": 0.55, + "grad_norm": 1.271498680114746, + "learning_rate": 4.5007515358000525e-06, + "loss": 0.5766, + "step": 4261 + }, + { + "epoch": 0.55, + "grad_norm": 1.3539847135543823, + "learning_rate": 4.498686920565972e-06, + "loss": 0.6176, + "step": 4262 + }, + { + "epoch": 0.55, + "grad_norm": 1.1289831399917603, + "learning_rate": 4.496622391672878e-06, + "loss": 0.5878, + "step": 4263 + }, + { + "epoch": 0.55, + "grad_norm": 1.1196657419204712, + "learning_rate": 4.494557949476347e-06, + "loss": 0.5116, + "step": 4264 + }, + { + "epoch": 0.55, + "grad_norm": 1.3079795837402344, + "learning_rate": 4.492493594331934e-06, + "loss": 0.524, + "step": 4265 + }, + { + "epoch": 0.55, + "grad_norm": 1.2875686883926392, + "learning_rate": 4.490429326595185e-06, + "loss": 0.5328, + "step": 4266 + }, + { + "epoch": 0.55, + "grad_norm": 1.1164149045944214, + "learning_rate": 4.488365146621626e-06, + "loss": 0.6377, + "step": 4267 + }, + { + "epoch": 0.55, + "grad_norm": 1.228834867477417, + "learning_rate": 4.486301054766773e-06, + "loss": 0.5413, + "step": 4268 + }, + { + "epoch": 0.55, + "grad_norm": 2.0576958656311035, + "learning_rate": 4.484237051386119e-06, + "loss": 0.6451, + "step": 4269 + }, + { + "epoch": 0.55, + "grad_norm": 1.1337889432907104, + "learning_rate": 4.482173136835152e-06, + "loss": 0.68, + "step": 4270 + }, + { + "epoch": 0.55, + "grad_norm": 1.1818243265151978, + "learning_rate": 4.480109311469336e-06, + "loss": 0.5929, + "step": 4271 + }, + { + "epoch": 0.55, + "grad_norm": 2.773174524307251, + "learning_rate": 4.4780455756441245e-06, + "loss": 0.5298, + "step": 4272 + }, + { + "epoch": 0.55, + "grad_norm": 1.5445469617843628, + "learning_rate": 4.475981929714953e-06, + "loss": 0.5569, + "step": 4273 + }, + { + "epoch": 0.55, + "grad_norm": 1.3637791872024536, + "learning_rate": 4.473918374037244e-06, + "loss": 0.6169, + "step": 4274 + }, + { + "epoch": 0.55, + "grad_norm": 1.1337825059890747, + "learning_rate": 4.471854908966402e-06, + "loss": 0.5918, + "step": 4275 + }, + { + "epoch": 0.55, + "grad_norm": 1.2219849824905396, + "learning_rate": 4.469791534857816e-06, + "loss": 0.5439, + "step": 4276 + }, + { + "epoch": 0.55, + "grad_norm": 2.176208734512329, + "learning_rate": 4.467728252066862e-06, + "loss": 0.6635, + "step": 4277 + }, + { + "epoch": 0.55, + "grad_norm": 1.3340530395507812, + "learning_rate": 4.465665060948897e-06, + "loss": 0.5424, + "step": 4278 + }, + { + "epoch": 0.55, + "grad_norm": 1.166491985321045, + "learning_rate": 4.4636019618592655e-06, + "loss": 0.665, + "step": 4279 + }, + { + "epoch": 0.55, + "grad_norm": 1.9526617527008057, + "learning_rate": 4.461538955153292e-06, + "loss": 0.6259, + "step": 4280 + }, + { + "epoch": 0.55, + "grad_norm": 1.2087116241455078, + "learning_rate": 4.4594760411862905e-06, + "loss": 0.7804, + "step": 4281 + }, + { + "epoch": 0.55, + "grad_norm": 1.6400904655456543, + "learning_rate": 4.457413220313553e-06, + "loss": 0.6266, + "step": 4282 + }, + { + "epoch": 0.55, + "grad_norm": 1.6623104810714722, + "learning_rate": 4.455350492890361e-06, + "loss": 0.6399, + "step": 4283 + }, + { + "epoch": 0.55, + "grad_norm": 1.3035902976989746, + "learning_rate": 4.453287859271975e-06, + "loss": 0.5671, + "step": 4284 + }, + { + "epoch": 0.55, + "grad_norm": 1.1311049461364746, + "learning_rate": 4.451225319813644e-06, + "loss": 0.5505, + "step": 4285 + }, + { + "epoch": 0.55, + "grad_norm": 1.258715033531189, + "learning_rate": 4.449162874870595e-06, + "loss": 0.62, + "step": 4286 + }, + { + "epoch": 0.55, + "grad_norm": 1.3356287479400635, + "learning_rate": 4.4471005247980464e-06, + "loss": 0.6583, + "step": 4287 + }, + { + "epoch": 0.55, + "grad_norm": 1.2244027853012085, + "learning_rate": 4.445038269951195e-06, + "loss": 0.5537, + "step": 4288 + }, + { + "epoch": 0.55, + "grad_norm": 1.3109939098358154, + "learning_rate": 4.4429761106852204e-06, + "loss": 0.5879, + "step": 4289 + }, + { + "epoch": 0.55, + "grad_norm": 1.1747775077819824, + "learning_rate": 4.44091404735529e-06, + "loss": 0.5629, + "step": 4290 + }, + { + "epoch": 0.55, + "grad_norm": 1.4858828783035278, + "learning_rate": 4.4388520803165495e-06, + "loss": 0.6296, + "step": 4291 + }, + { + "epoch": 0.55, + "grad_norm": 2.4959919452667236, + "learning_rate": 4.436790209924134e-06, + "loss": 0.6197, + "step": 4292 + }, + { + "epoch": 0.55, + "grad_norm": 1.1516481637954712, + "learning_rate": 4.434728436533156e-06, + "loss": 0.7115, + "step": 4293 + }, + { + "epoch": 0.55, + "grad_norm": 1.2591958045959473, + "learning_rate": 4.4326667604987165e-06, + "loss": 0.571, + "step": 4294 + }, + { + "epoch": 0.55, + "grad_norm": 1.4871764183044434, + "learning_rate": 4.430605182175895e-06, + "loss": 0.5495, + "step": 4295 + }, + { + "epoch": 0.55, + "grad_norm": 1.2093896865844727, + "learning_rate": 4.428543701919758e-06, + "loss": 0.5496, + "step": 4296 + }, + { + "epoch": 0.55, + "grad_norm": 1.2225619554519653, + "learning_rate": 4.426482320085352e-06, + "loss": 0.5976, + "step": 4297 + }, + { + "epoch": 0.55, + "grad_norm": 1.4069699048995972, + "learning_rate": 4.424421037027711e-06, + "loss": 0.5725, + "step": 4298 + }, + { + "epoch": 0.55, + "grad_norm": 1.7598546743392944, + "learning_rate": 4.422359853101846e-06, + "loss": 0.5402, + "step": 4299 + }, + { + "epoch": 0.55, + "grad_norm": 1.1938350200653076, + "learning_rate": 4.420298768662756e-06, + "loss": 0.5808, + "step": 4300 + }, + { + "epoch": 0.55, + "grad_norm": 1.3357658386230469, + "learning_rate": 4.418237784065419e-06, + "loss": 0.6606, + "step": 4301 + }, + { + "epoch": 0.55, + "grad_norm": 1.4759631156921387, + "learning_rate": 4.416176899664801e-06, + "loss": 0.6102, + "step": 4302 + }, + { + "epoch": 0.55, + "grad_norm": 1.2692142724990845, + "learning_rate": 4.4141161158158426e-06, + "loss": 0.6524, + "step": 4303 + }, + { + "epoch": 0.55, + "grad_norm": 1.1206196546554565, + "learning_rate": 4.412055432873475e-06, + "loss": 0.6809, + "step": 4304 + }, + { + "epoch": 0.55, + "grad_norm": 1.9297462701797485, + "learning_rate": 4.409994851192611e-06, + "loss": 0.5622, + "step": 4305 + }, + { + "epoch": 0.55, + "grad_norm": 1.0943596363067627, + "learning_rate": 4.40793437112814e-06, + "loss": 0.5449, + "step": 4306 + }, + { + "epoch": 0.55, + "grad_norm": 1.434758186340332, + "learning_rate": 4.4058739930349406e-06, + "loss": 0.6168, + "step": 4307 + }, + { + "epoch": 0.55, + "grad_norm": 1.8183618783950806, + "learning_rate": 4.403813717267869e-06, + "loss": 0.5821, + "step": 4308 + }, + { + "epoch": 0.55, + "grad_norm": 1.0242024660110474, + "learning_rate": 4.401753544181767e-06, + "loss": 0.6293, + "step": 4309 + }, + { + "epoch": 0.55, + "grad_norm": 1.1580708026885986, + "learning_rate": 4.399693474131456e-06, + "loss": 0.6038, + "step": 4310 + }, + { + "epoch": 0.55, + "grad_norm": 1.2843397855758667, + "learning_rate": 4.3976335074717446e-06, + "loss": 0.6346, + "step": 4311 + }, + { + "epoch": 0.55, + "grad_norm": 1.1573116779327393, + "learning_rate": 4.3955736445574176e-06, + "loss": 0.6021, + "step": 4312 + }, + { + "epoch": 0.55, + "grad_norm": 1.339087963104248, + "learning_rate": 4.393513885743243e-06, + "loss": 0.5863, + "step": 4313 + }, + { + "epoch": 0.55, + "grad_norm": 1.3947805166244507, + "learning_rate": 4.391454231383976e-06, + "loss": 0.649, + "step": 4314 + }, + { + "epoch": 0.55, + "grad_norm": 1.3123061656951904, + "learning_rate": 4.389394681834348e-06, + "loss": 0.6246, + "step": 4315 + }, + { + "epoch": 0.55, + "grad_norm": 1.151753306388855, + "learning_rate": 4.387335237449076e-06, + "loss": 0.709, + "step": 4316 + }, + { + "epoch": 0.55, + "grad_norm": 1.03976571559906, + "learning_rate": 4.385275898582855e-06, + "loss": 0.549, + "step": 4317 + }, + { + "epoch": 0.55, + "grad_norm": 1.5990352630615234, + "learning_rate": 4.383216665590366e-06, + "loss": 0.7688, + "step": 4318 + }, + { + "epoch": 0.55, + "grad_norm": 1.2555058002471924, + "learning_rate": 4.381157538826269e-06, + "loss": 0.6593, + "step": 4319 + }, + { + "epoch": 0.55, + "grad_norm": 1.5851774215698242, + "learning_rate": 4.379098518645207e-06, + "loss": 0.6126, + "step": 4320 + }, + { + "epoch": 0.55, + "grad_norm": 1.2325047254562378, + "learning_rate": 4.377039605401807e-06, + "loss": 0.6097, + "step": 4321 + }, + { + "epoch": 0.55, + "grad_norm": 1.3471224308013916, + "learning_rate": 4.374980799450672e-06, + "loss": 0.5994, + "step": 4322 + }, + { + "epoch": 0.55, + "grad_norm": 2.4533209800720215, + "learning_rate": 4.372922101146391e-06, + "loss": 0.5687, + "step": 4323 + }, + { + "epoch": 0.55, + "grad_norm": 1.748429536819458, + "learning_rate": 4.370863510843531e-06, + "loss": 0.6614, + "step": 4324 + }, + { + "epoch": 0.55, + "grad_norm": 1.2647942304611206, + "learning_rate": 4.368805028896645e-06, + "loss": 0.6273, + "step": 4325 + }, + { + "epoch": 0.55, + "grad_norm": 1.4914000034332275, + "learning_rate": 4.366746655660262e-06, + "loss": 0.7031, + "step": 4326 + }, + { + "epoch": 0.55, + "grad_norm": 1.345897912979126, + "learning_rate": 4.364688391488897e-06, + "loss": 0.5652, + "step": 4327 + }, + { + "epoch": 0.55, + "grad_norm": 1.1223891973495483, + "learning_rate": 4.362630236737043e-06, + "loss": 0.6655, + "step": 4328 + }, + { + "epoch": 0.55, + "grad_norm": 1.1209384202957153, + "learning_rate": 4.360572191759176e-06, + "loss": 0.6175, + "step": 4329 + }, + { + "epoch": 0.55, + "grad_norm": 1.4959664344787598, + "learning_rate": 4.358514256909751e-06, + "loss": 0.6079, + "step": 4330 + }, + { + "epoch": 0.55, + "grad_norm": 1.6316180229187012, + "learning_rate": 4.356456432543208e-06, + "loss": 0.6025, + "step": 4331 + }, + { + "epoch": 0.55, + "grad_norm": 1.7658164501190186, + "learning_rate": 4.354398719013964e-06, + "loss": 0.5352, + "step": 4332 + }, + { + "epoch": 0.56, + "grad_norm": 1.239953875541687, + "learning_rate": 4.352341116676418e-06, + "loss": 0.6356, + "step": 4333 + }, + { + "epoch": 0.56, + "grad_norm": 1.2205899953842163, + "learning_rate": 4.350283625884949e-06, + "loss": 0.7296, + "step": 4334 + }, + { + "epoch": 0.56, + "grad_norm": 1.6176862716674805, + "learning_rate": 4.348226246993922e-06, + "loss": 0.6232, + "step": 4335 + }, + { + "epoch": 0.56, + "grad_norm": 1.2962661981582642, + "learning_rate": 4.346168980357674e-06, + "loss": 0.6093, + "step": 4336 + }, + { + "epoch": 0.56, + "grad_norm": 1.1365058422088623, + "learning_rate": 4.344111826330529e-06, + "loss": 0.6139, + "step": 4337 + }, + { + "epoch": 0.56, + "grad_norm": 1.4743257761001587, + "learning_rate": 4.342054785266792e-06, + "loss": 0.6462, + "step": 4338 + }, + { + "epoch": 0.56, + "grad_norm": 1.2239124774932861, + "learning_rate": 4.339997857520745e-06, + "loss": 0.58, + "step": 4339 + }, + { + "epoch": 0.56, + "grad_norm": 1.2178657054901123, + "learning_rate": 4.337941043446653e-06, + "loss": 0.6802, + "step": 4340 + }, + { + "epoch": 0.56, + "grad_norm": 0.9617922902107239, + "learning_rate": 4.335884343398757e-06, + "loss": 0.6315, + "step": 4341 + }, + { + "epoch": 0.56, + "grad_norm": 1.1992239952087402, + "learning_rate": 4.333827757731286e-06, + "loss": 0.6106, + "step": 4342 + }, + { + "epoch": 0.56, + "grad_norm": 1.2823426723480225, + "learning_rate": 4.331771286798442e-06, + "loss": 0.7401, + "step": 4343 + }, + { + "epoch": 0.56, + "grad_norm": 1.0556762218475342, + "learning_rate": 4.329714930954414e-06, + "loss": 0.5582, + "step": 4344 + }, + { + "epoch": 0.56, + "grad_norm": 1.2360563278198242, + "learning_rate": 4.327658690553362e-06, + "loss": 0.5358, + "step": 4345 + }, + { + "epoch": 0.56, + "grad_norm": 1.4432363510131836, + "learning_rate": 4.325602565949437e-06, + "loss": 0.6498, + "step": 4346 + }, + { + "epoch": 0.56, + "grad_norm": 1.0684760808944702, + "learning_rate": 4.3235465574967615e-06, + "loss": 0.5381, + "step": 4347 + }, + { + "epoch": 0.56, + "grad_norm": 1.2283833026885986, + "learning_rate": 4.321490665549442e-06, + "loss": 0.525, + "step": 4348 + }, + { + "epoch": 0.56, + "grad_norm": 1.2738137245178223, + "learning_rate": 4.319434890461563e-06, + "loss": 0.6444, + "step": 4349 + }, + { + "epoch": 0.56, + "grad_norm": 1.3298536539077759, + "learning_rate": 4.317379232587194e-06, + "loss": 0.6234, + "step": 4350 + }, + { + "epoch": 0.56, + "grad_norm": 1.158233642578125, + "learning_rate": 4.315323692280375e-06, + "loss": 0.5949, + "step": 4351 + }, + { + "epoch": 0.56, + "grad_norm": 3.0666298866271973, + "learning_rate": 4.313268269895134e-06, + "loss": 0.5992, + "step": 4352 + }, + { + "epoch": 0.56, + "grad_norm": 1.5501093864440918, + "learning_rate": 4.3112129657854755e-06, + "loss": 0.5746, + "step": 4353 + }, + { + "epoch": 0.56, + "grad_norm": 1.7205291986465454, + "learning_rate": 4.3091577803053816e-06, + "loss": 0.6069, + "step": 4354 + }, + { + "epoch": 0.56, + "grad_norm": 1.564469337463379, + "learning_rate": 4.3071027138088206e-06, + "loss": 0.5885, + "step": 4355 + }, + { + "epoch": 0.56, + "grad_norm": 1.302836298942566, + "learning_rate": 4.305047766649733e-06, + "loss": 0.6223, + "step": 4356 + }, + { + "epoch": 0.56, + "grad_norm": 1.3741014003753662, + "learning_rate": 4.302992939182042e-06, + "loss": 0.5507, + "step": 4357 + }, + { + "epoch": 0.56, + "grad_norm": 1.358909249305725, + "learning_rate": 4.30093823175965e-06, + "loss": 0.6304, + "step": 4358 + }, + { + "epoch": 0.56, + "grad_norm": 1.1862109899520874, + "learning_rate": 4.298883644736438e-06, + "loss": 0.7494, + "step": 4359 + }, + { + "epoch": 0.56, + "grad_norm": 1.1762068271636963, + "learning_rate": 4.296829178466268e-06, + "loss": 0.6076, + "step": 4360 + }, + { + "epoch": 0.56, + "grad_norm": 1.2821547985076904, + "learning_rate": 4.294774833302981e-06, + "loss": 0.627, + "step": 4361 + }, + { + "epoch": 0.56, + "grad_norm": 1.2723315954208374, + "learning_rate": 4.292720609600393e-06, + "loss": 0.6285, + "step": 4362 + }, + { + "epoch": 0.56, + "grad_norm": 1.2723183631896973, + "learning_rate": 4.290666507712304e-06, + "loss": 0.6313, + "step": 4363 + }, + { + "epoch": 0.56, + "grad_norm": 1.1899631023406982, + "learning_rate": 4.288612527992492e-06, + "loss": 0.6145, + "step": 4364 + }, + { + "epoch": 0.56, + "grad_norm": 1.3209925889968872, + "learning_rate": 4.286558670794712e-06, + "loss": 0.5391, + "step": 4365 + }, + { + "epoch": 0.56, + "grad_norm": 1.0585249662399292, + "learning_rate": 4.284504936472701e-06, + "loss": 0.5716, + "step": 4366 + }, + { + "epoch": 0.56, + "grad_norm": 1.8191659450531006, + "learning_rate": 4.28245132538017e-06, + "loss": 0.5629, + "step": 4367 + }, + { + "epoch": 0.56, + "grad_norm": 1.2652943134307861, + "learning_rate": 4.2803978378708145e-06, + "loss": 0.6255, + "step": 4368 + }, + { + "epoch": 0.56, + "grad_norm": 1.146340250968933, + "learning_rate": 4.278344474298304e-06, + "loss": 0.726, + "step": 4369 + }, + { + "epoch": 0.56, + "grad_norm": 1.114557147026062, + "learning_rate": 4.276291235016291e-06, + "loss": 0.6706, + "step": 4370 + }, + { + "epoch": 0.56, + "grad_norm": 1.4655970335006714, + "learning_rate": 4.274238120378401e-06, + "loss": 0.5154, + "step": 4371 + }, + { + "epoch": 0.56, + "grad_norm": 1.245018720626831, + "learning_rate": 4.272185130738243e-06, + "loss": 0.5518, + "step": 4372 + }, + { + "epoch": 0.56, + "grad_norm": 1.4143431186676025, + "learning_rate": 4.270132266449404e-06, + "loss": 0.589, + "step": 4373 + }, + { + "epoch": 0.56, + "grad_norm": 1.2347571849822998, + "learning_rate": 4.268079527865447e-06, + "loss": 0.712, + "step": 4374 + }, + { + "epoch": 0.56, + "grad_norm": 1.2976598739624023, + "learning_rate": 4.266026915339915e-06, + "loss": 0.5632, + "step": 4375 + }, + { + "epoch": 0.56, + "grad_norm": 1.4560409784317017, + "learning_rate": 4.263974429226327e-06, + "loss": 0.6189, + "step": 4376 + }, + { + "epoch": 0.56, + "grad_norm": 1.3681960105895996, + "learning_rate": 4.261922069878185e-06, + "loss": 0.4996, + "step": 4377 + }, + { + "epoch": 0.56, + "grad_norm": 2.385707139968872, + "learning_rate": 4.259869837648963e-06, + "loss": 0.615, + "step": 4378 + }, + { + "epoch": 0.56, + "grad_norm": 2.20070743560791, + "learning_rate": 4.2578177328921185e-06, + "loss": 0.5422, + "step": 4379 + }, + { + "epoch": 0.56, + "grad_norm": 1.4489797353744507, + "learning_rate": 4.255765755961083e-06, + "loss": 0.6436, + "step": 4380 + }, + { + "epoch": 0.56, + "grad_norm": 1.207725167274475, + "learning_rate": 4.253713907209271e-06, + "loss": 0.5272, + "step": 4381 + }, + { + "epoch": 0.56, + "grad_norm": 1.0956413745880127, + "learning_rate": 4.251662186990067e-06, + "loss": 0.5381, + "step": 4382 + }, + { + "epoch": 0.56, + "grad_norm": 1.241393804550171, + "learning_rate": 4.249610595656843e-06, + "loss": 0.6241, + "step": 4383 + }, + { + "epoch": 0.56, + "grad_norm": 1.1352612972259521, + "learning_rate": 4.24755913356294e-06, + "loss": 0.6223, + "step": 4384 + }, + { + "epoch": 0.56, + "grad_norm": 1.1019214391708374, + "learning_rate": 4.245507801061684e-06, + "loss": 0.7584, + "step": 4385 + }, + { + "epoch": 0.56, + "grad_norm": 1.3050336837768555, + "learning_rate": 4.243456598506373e-06, + "loss": 0.6316, + "step": 4386 + }, + { + "epoch": 0.56, + "grad_norm": 1.0004106760025024, + "learning_rate": 4.241405526250285e-06, + "loss": 0.5286, + "step": 4387 + }, + { + "epoch": 0.56, + "grad_norm": 1.244195818901062, + "learning_rate": 4.239354584646677e-06, + "loss": 0.5851, + "step": 4388 + }, + { + "epoch": 0.56, + "grad_norm": 1.2075914144515991, + "learning_rate": 4.2373037740487785e-06, + "loss": 0.5609, + "step": 4389 + }, + { + "epoch": 0.56, + "grad_norm": 1.2431285381317139, + "learning_rate": 4.235253094809804e-06, + "loss": 0.5163, + "step": 4390 + }, + { + "epoch": 0.56, + "grad_norm": 1.424965739250183, + "learning_rate": 4.233202547282941e-06, + "loss": 0.5798, + "step": 4391 + }, + { + "epoch": 0.56, + "grad_norm": 1.2569266557693481, + "learning_rate": 4.231152131821353e-06, + "loss": 0.6726, + "step": 4392 + }, + { + "epoch": 0.56, + "grad_norm": 1.3034515380859375, + "learning_rate": 4.2291018487781825e-06, + "loss": 0.6889, + "step": 4393 + }, + { + "epoch": 0.56, + "grad_norm": 1.1181299686431885, + "learning_rate": 4.227051698506551e-06, + "loss": 0.622, + "step": 4394 + }, + { + "epoch": 0.56, + "grad_norm": 1.356863260269165, + "learning_rate": 4.225001681359552e-06, + "loss": 0.6016, + "step": 4395 + }, + { + "epoch": 0.56, + "grad_norm": 1.473215937614441, + "learning_rate": 4.222951797690262e-06, + "loss": 0.603, + "step": 4396 + }, + { + "epoch": 0.56, + "grad_norm": 1.1657586097717285, + "learning_rate": 4.220902047851729e-06, + "loss": 0.6301, + "step": 4397 + }, + { + "epoch": 0.56, + "grad_norm": 1.2193200588226318, + "learning_rate": 4.218852432196984e-06, + "loss": 0.5981, + "step": 4398 + }, + { + "epoch": 0.56, + "grad_norm": 2.201629400253296, + "learning_rate": 4.21680295107903e-06, + "loss": 0.5681, + "step": 4399 + }, + { + "epoch": 0.56, + "grad_norm": 1.4651731252670288, + "learning_rate": 4.2147536048508485e-06, + "loss": 0.5787, + "step": 4400 + }, + { + "epoch": 0.56, + "grad_norm": 0.9583895206451416, + "learning_rate": 4.212704393865398e-06, + "loss": 0.521, + "step": 4401 + }, + { + "epoch": 0.56, + "grad_norm": 1.2025455236434937, + "learning_rate": 4.210655318475613e-06, + "loss": 0.6295, + "step": 4402 + }, + { + "epoch": 0.56, + "grad_norm": 1.3904603719711304, + "learning_rate": 4.208606379034405e-06, + "loss": 0.584, + "step": 4403 + }, + { + "epoch": 0.56, + "grad_norm": 1.3118081092834473, + "learning_rate": 4.206557575894664e-06, + "loss": 0.6188, + "step": 4404 + }, + { + "epoch": 0.56, + "grad_norm": 1.2359284162521362, + "learning_rate": 4.204508909409253e-06, + "loss": 0.5612, + "step": 4405 + }, + { + "epoch": 0.56, + "grad_norm": 1.6139267683029175, + "learning_rate": 4.202460379931009e-06, + "loss": 0.5957, + "step": 4406 + }, + { + "epoch": 0.56, + "grad_norm": 1.1521854400634766, + "learning_rate": 4.200411987812758e-06, + "loss": 0.613, + "step": 4407 + }, + { + "epoch": 0.56, + "grad_norm": 1.0886516571044922, + "learning_rate": 4.198363733407289e-06, + "loss": 0.6028, + "step": 4408 + }, + { + "epoch": 0.56, + "grad_norm": 1.284321904182434, + "learning_rate": 4.196315617067374e-06, + "loss": 0.5777, + "step": 4409 + }, + { + "epoch": 0.56, + "grad_norm": 1.2444514036178589, + "learning_rate": 4.194267639145758e-06, + "loss": 0.5497, + "step": 4410 + }, + { + "epoch": 0.57, + "grad_norm": 1.4131932258605957, + "learning_rate": 4.192219799995164e-06, + "loss": 0.6017, + "step": 4411 + }, + { + "epoch": 0.57, + "grad_norm": 2.238851308822632, + "learning_rate": 4.190172099968291e-06, + "loss": 0.5908, + "step": 4412 + }, + { + "epoch": 0.57, + "grad_norm": 1.365478754043579, + "learning_rate": 4.1881245394178125e-06, + "loss": 0.5623, + "step": 4413 + }, + { + "epoch": 0.57, + "grad_norm": 1.6913090944290161, + "learning_rate": 4.186077118696381e-06, + "loss": 0.6189, + "step": 4414 + }, + { + "epoch": 0.57, + "grad_norm": 1.0305039882659912, + "learning_rate": 4.184029838156622e-06, + "loss": 0.6424, + "step": 4415 + }, + { + "epoch": 0.57, + "grad_norm": 0.9532804489135742, + "learning_rate": 4.181982698151138e-06, + "loss": 0.6773, + "step": 4416 + }, + { + "epoch": 0.57, + "grad_norm": 1.6363182067871094, + "learning_rate": 4.179935699032507e-06, + "loss": 0.5869, + "step": 4417 + }, + { + "epoch": 0.57, + "grad_norm": 1.2440401315689087, + "learning_rate": 4.177888841153285e-06, + "loss": 0.6201, + "step": 4418 + }, + { + "epoch": 0.57, + "grad_norm": 1.4306551218032837, + "learning_rate": 4.175842124865998e-06, + "loss": 0.6048, + "step": 4419 + }, + { + "epoch": 0.57, + "grad_norm": 1.1270860433578491, + "learning_rate": 4.1737955505231546e-06, + "loss": 0.6119, + "step": 4420 + }, + { + "epoch": 0.57, + "grad_norm": 1.3283473253250122, + "learning_rate": 4.171749118477234e-06, + "loss": 0.549, + "step": 4421 + }, + { + "epoch": 0.57, + "grad_norm": 1.4524391889572144, + "learning_rate": 4.1697028290806935e-06, + "loss": 0.5849, + "step": 4422 + }, + { + "epoch": 0.57, + "grad_norm": 1.351607084274292, + "learning_rate": 4.167656682685962e-06, + "loss": 0.6161, + "step": 4423 + }, + { + "epoch": 0.57, + "grad_norm": 1.2113388776779175, + "learning_rate": 4.165610679645451e-06, + "loss": 0.5219, + "step": 4424 + }, + { + "epoch": 0.57, + "grad_norm": 2.863964557647705, + "learning_rate": 4.1635648203115405e-06, + "loss": 0.5498, + "step": 4425 + }, + { + "epoch": 0.57, + "grad_norm": 1.9458630084991455, + "learning_rate": 4.161519105036588e-06, + "loss": 0.4855, + "step": 4426 + }, + { + "epoch": 0.57, + "grad_norm": 1.1281429529190063, + "learning_rate": 4.159473534172927e-06, + "loss": 0.5676, + "step": 4427 + }, + { + "epoch": 0.57, + "grad_norm": 1.1383236646652222, + "learning_rate": 4.157428108072866e-06, + "loss": 0.6055, + "step": 4428 + }, + { + "epoch": 0.57, + "grad_norm": 1.510728120803833, + "learning_rate": 4.155382827088688e-06, + "loss": 0.5496, + "step": 4429 + }, + { + "epoch": 0.57, + "grad_norm": 1.6128290891647339, + "learning_rate": 4.1533376915726495e-06, + "loss": 0.5808, + "step": 4430 + }, + { + "epoch": 0.57, + "grad_norm": 2.0385468006134033, + "learning_rate": 4.151292701876986e-06, + "loss": 0.6876, + "step": 4431 + }, + { + "epoch": 0.57, + "grad_norm": 1.1657432317733765, + "learning_rate": 4.149247858353902e-06, + "loss": 0.5627, + "step": 4432 + }, + { + "epoch": 0.57, + "grad_norm": 1.0777486562728882, + "learning_rate": 4.147203161355583e-06, + "loss": 0.5524, + "step": 4433 + }, + { + "epoch": 0.57, + "grad_norm": 1.4103708267211914, + "learning_rate": 4.145158611234186e-06, + "loss": 0.5984, + "step": 4434 + }, + { + "epoch": 0.57, + "grad_norm": 1.323095679283142, + "learning_rate": 4.143114208341843e-06, + "loss": 0.7533, + "step": 4435 + }, + { + "epoch": 0.57, + "grad_norm": 1.0617965459823608, + "learning_rate": 4.14106995303066e-06, + "loss": 0.5806, + "step": 4436 + }, + { + "epoch": 0.57, + "grad_norm": 1.3179047107696533, + "learning_rate": 4.1390258456527195e-06, + "loss": 0.5494, + "step": 4437 + }, + { + "epoch": 0.57, + "grad_norm": 3.672276020050049, + "learning_rate": 4.136981886560078e-06, + "loss": 0.5966, + "step": 4438 + }, + { + "epoch": 0.57, + "grad_norm": 1.2726572751998901, + "learning_rate": 4.134938076104764e-06, + "loss": 0.6048, + "step": 4439 + }, + { + "epoch": 0.57, + "grad_norm": 1.0546799898147583, + "learning_rate": 4.132894414638782e-06, + "loss": 0.6254, + "step": 4440 + }, + { + "epoch": 0.57, + "grad_norm": 1.2751909494400024, + "learning_rate": 4.130850902514114e-06, + "loss": 0.5718, + "step": 4441 + }, + { + "epoch": 0.57, + "grad_norm": 1.2318024635314941, + "learning_rate": 4.128807540082714e-06, + "loss": 0.5496, + "step": 4442 + }, + { + "epoch": 0.57, + "grad_norm": 1.266778826713562, + "learning_rate": 4.126764327696504e-06, + "loss": 0.5568, + "step": 4443 + }, + { + "epoch": 0.57, + "grad_norm": 1.3783042430877686, + "learning_rate": 4.124721265707392e-06, + "loss": 0.5641, + "step": 4444 + }, + { + "epoch": 0.57, + "grad_norm": 2.1089837551116943, + "learning_rate": 4.12267835446725e-06, + "loss": 0.6204, + "step": 4445 + }, + { + "epoch": 0.57, + "grad_norm": 3.281911611557007, + "learning_rate": 4.12063559432793e-06, + "loss": 0.7028, + "step": 4446 + }, + { + "epoch": 0.57, + "grad_norm": 1.1332265138626099, + "learning_rate": 4.118592985641254e-06, + "loss": 0.6078, + "step": 4447 + }, + { + "epoch": 0.57, + "grad_norm": 1.2196893692016602, + "learning_rate": 4.116550528759023e-06, + "loss": 0.5784, + "step": 4448 + }, + { + "epoch": 0.57, + "grad_norm": 1.2832058668136597, + "learning_rate": 4.114508224033004e-06, + "loss": 0.6051, + "step": 4449 + }, + { + "epoch": 0.57, + "grad_norm": 1.4253997802734375, + "learning_rate": 4.112466071814947e-06, + "loss": 0.6712, + "step": 4450 + }, + { + "epoch": 0.57, + "grad_norm": 1.711785912513733, + "learning_rate": 4.110424072456568e-06, + "loss": 0.6563, + "step": 4451 + }, + { + "epoch": 0.57, + "grad_norm": 1.2868982553482056, + "learning_rate": 4.108382226309563e-06, + "loss": 0.5904, + "step": 4452 + }, + { + "epoch": 0.57, + "grad_norm": 1.2525250911712646, + "learning_rate": 4.106340533725595e-06, + "loss": 0.6214, + "step": 4453 + }, + { + "epoch": 0.57, + "grad_norm": 1.175255537033081, + "learning_rate": 4.104298995056307e-06, + "loss": 0.5653, + "step": 4454 + }, + { + "epoch": 0.57, + "grad_norm": 1.409961223602295, + "learning_rate": 4.102257610653311e-06, + "loss": 0.605, + "step": 4455 + }, + { + "epoch": 0.57, + "grad_norm": 1.28078293800354, + "learning_rate": 4.100216380868194e-06, + "loss": 0.5668, + "step": 4456 + }, + { + "epoch": 0.57, + "grad_norm": 1.1241258382797241, + "learning_rate": 4.098175306052515e-06, + "loss": 0.6365, + "step": 4457 + }, + { + "epoch": 0.57, + "grad_norm": 1.2419236898422241, + "learning_rate": 4.09613438655781e-06, + "loss": 0.6215, + "step": 4458 + }, + { + "epoch": 0.57, + "grad_norm": 1.4537636041641235, + "learning_rate": 4.0940936227355866e-06, + "loss": 0.5763, + "step": 4459 + }, + { + "epoch": 0.57, + "grad_norm": 1.1015124320983887, + "learning_rate": 4.0920530149373235e-06, + "loss": 0.5656, + "step": 4460 + }, + { + "epoch": 0.57, + "grad_norm": 1.080503225326538, + "learning_rate": 4.090012563514473e-06, + "loss": 0.606, + "step": 4461 + }, + { + "epoch": 0.57, + "grad_norm": 1.4011242389678955, + "learning_rate": 4.087972268818463e-06, + "loss": 0.6446, + "step": 4462 + }, + { + "epoch": 0.57, + "grad_norm": 1.2129254341125488, + "learning_rate": 4.085932131200691e-06, + "loss": 0.5713, + "step": 4463 + }, + { + "epoch": 0.57, + "grad_norm": 1.1942564249038696, + "learning_rate": 4.083892151012531e-06, + "loss": 0.6235, + "step": 4464 + }, + { + "epoch": 0.57, + "grad_norm": 1.2527570724487305, + "learning_rate": 4.081852328605327e-06, + "loss": 0.5275, + "step": 4465 + }, + { + "epoch": 0.57, + "grad_norm": 1.1266459226608276, + "learning_rate": 4.079812664330398e-06, + "loss": 0.6578, + "step": 4466 + }, + { + "epoch": 0.57, + "grad_norm": 1.1888070106506348, + "learning_rate": 4.0777731585390335e-06, + "loss": 0.5591, + "step": 4467 + }, + { + "epoch": 0.57, + "grad_norm": 1.3004099130630493, + "learning_rate": 4.0757338115824975e-06, + "loss": 0.5872, + "step": 4468 + }, + { + "epoch": 0.57, + "grad_norm": 3.164547920227051, + "learning_rate": 4.073694623812026e-06, + "loss": 0.5892, + "step": 4469 + }, + { + "epoch": 0.57, + "grad_norm": 1.132218599319458, + "learning_rate": 4.071655595578829e-06, + "loss": 0.5648, + "step": 4470 + }, + { + "epoch": 0.57, + "grad_norm": 1.187025547027588, + "learning_rate": 4.0696167272340845e-06, + "loss": 0.5044, + "step": 4471 + }, + { + "epoch": 0.57, + "grad_norm": 1.381439208984375, + "learning_rate": 4.0675780191289496e-06, + "loss": 0.6377, + "step": 4472 + }, + { + "epoch": 0.57, + "grad_norm": 1.3431545495986938, + "learning_rate": 4.065539471614547e-06, + "loss": 0.5823, + "step": 4473 + }, + { + "epoch": 0.57, + "grad_norm": 1.775613784790039, + "learning_rate": 4.063501085041976e-06, + "loss": 0.5882, + "step": 4474 + }, + { + "epoch": 0.57, + "grad_norm": 1.6177005767822266, + "learning_rate": 4.06146285976231e-06, + "loss": 0.6739, + "step": 4475 + }, + { + "epoch": 0.57, + "grad_norm": 1.1213589906692505, + "learning_rate": 4.059424796126589e-06, + "loss": 0.6788, + "step": 4476 + }, + { + "epoch": 0.57, + "grad_norm": 1.4037939310073853, + "learning_rate": 4.0573868944858306e-06, + "loss": 0.5024, + "step": 4477 + }, + { + "epoch": 0.57, + "grad_norm": 1.1332634687423706, + "learning_rate": 4.055349155191018e-06, + "loss": 0.7279, + "step": 4478 + }, + { + "epoch": 0.57, + "grad_norm": 1.2475149631500244, + "learning_rate": 4.0533115785931146e-06, + "loss": 0.6189, + "step": 4479 + }, + { + "epoch": 0.57, + "grad_norm": 1.490715742111206, + "learning_rate": 4.051274165043049e-06, + "loss": 0.6506, + "step": 4480 + }, + { + "epoch": 0.57, + "grad_norm": 1.3428494930267334, + "learning_rate": 4.049236914891726e-06, + "loss": 0.56, + "step": 4481 + }, + { + "epoch": 0.57, + "grad_norm": 1.3903584480285645, + "learning_rate": 4.047199828490017e-06, + "loss": 0.5654, + "step": 4482 + }, + { + "epoch": 0.57, + "grad_norm": 1.2388803958892822, + "learning_rate": 4.045162906188773e-06, + "loss": 0.5477, + "step": 4483 + }, + { + "epoch": 0.57, + "grad_norm": 1.5961958169937134, + "learning_rate": 4.04312614833881e-06, + "loss": 0.574, + "step": 4484 + }, + { + "epoch": 0.57, + "grad_norm": 1.4322210550308228, + "learning_rate": 4.041089555290919e-06, + "loss": 0.5648, + "step": 4485 + }, + { + "epoch": 0.57, + "grad_norm": 1.3006025552749634, + "learning_rate": 4.039053127395861e-06, + "loss": 0.6208, + "step": 4486 + }, + { + "epoch": 0.57, + "grad_norm": 1.0778475999832153, + "learning_rate": 4.037016865004371e-06, + "loss": 0.5335, + "step": 4487 + }, + { + "epoch": 0.57, + "grad_norm": 1.3122655153274536, + "learning_rate": 4.0349807684671506e-06, + "loss": 0.606, + "step": 4488 + }, + { + "epoch": 0.58, + "grad_norm": 1.462798833847046, + "learning_rate": 4.03294483813488e-06, + "loss": 0.6424, + "step": 4489 + }, + { + "epoch": 0.58, + "grad_norm": 1.512628436088562, + "learning_rate": 4.030909074358204e-06, + "loss": 0.5838, + "step": 4490 + }, + { + "epoch": 0.58, + "grad_norm": 1.626233696937561, + "learning_rate": 4.028873477487741e-06, + "loss": 0.6137, + "step": 4491 + }, + { + "epoch": 0.58, + "grad_norm": 1.5814064741134644, + "learning_rate": 4.026838047874084e-06, + "loss": 0.6584, + "step": 4492 + }, + { + "epoch": 0.58, + "grad_norm": 1.6172866821289062, + "learning_rate": 4.024802785867793e-06, + "loss": 0.6242, + "step": 4493 + }, + { + "epoch": 0.58, + "grad_norm": 1.1439348459243774, + "learning_rate": 4.0227676918194015e-06, + "loss": 0.7038, + "step": 4494 + }, + { + "epoch": 0.58, + "grad_norm": 1.3968899250030518, + "learning_rate": 4.020732766079411e-06, + "loss": 0.6014, + "step": 4495 + }, + { + "epoch": 0.58, + "grad_norm": 1.4990570545196533, + "learning_rate": 4.018698008998298e-06, + "loss": 0.623, + "step": 4496 + }, + { + "epoch": 0.58, + "grad_norm": 1.8007217645645142, + "learning_rate": 4.0166634209265065e-06, + "loss": 0.6442, + "step": 4497 + }, + { + "epoch": 0.58, + "grad_norm": 1.3919003009796143, + "learning_rate": 4.014629002214454e-06, + "loss": 0.46, + "step": 4498 + }, + { + "epoch": 0.58, + "grad_norm": 1.3041036128997803, + "learning_rate": 4.012594753212528e-06, + "loss": 0.6084, + "step": 4499 + }, + { + "epoch": 0.58, + "grad_norm": 1.221053123474121, + "learning_rate": 4.010560674271085e-06, + "loss": 0.5738, + "step": 4500 + }, + { + "epoch": 0.58, + "grad_norm": 1.5155267715454102, + "learning_rate": 4.0085267657404544e-06, + "loss": 0.7307, + "step": 4501 + }, + { + "epoch": 0.58, + "grad_norm": 1.1297587156295776, + "learning_rate": 4.006493027970938e-06, + "loss": 0.6133, + "step": 4502 + }, + { + "epoch": 0.58, + "grad_norm": 1.395195484161377, + "learning_rate": 4.004459461312802e-06, + "loss": 0.7651, + "step": 4503 + }, + { + "epoch": 0.58, + "grad_norm": 1.0992610454559326, + "learning_rate": 4.0024260661162895e-06, + "loss": 0.4897, + "step": 4504 + }, + { + "epoch": 0.58, + "grad_norm": 1.1246989965438843, + "learning_rate": 4.000392842731611e-06, + "loss": 0.5924, + "step": 4505 + }, + { + "epoch": 0.58, + "grad_norm": 1.2189942598342896, + "learning_rate": 3.998359791508946e-06, + "loss": 0.6192, + "step": 4506 + }, + { + "epoch": 0.58, + "grad_norm": 1.5760741233825684, + "learning_rate": 3.996326912798449e-06, + "loss": 0.6763, + "step": 4507 + }, + { + "epoch": 0.58, + "grad_norm": 1.6541393995285034, + "learning_rate": 3.994294206950241e-06, + "loss": 0.6857, + "step": 4508 + }, + { + "epoch": 0.58, + "grad_norm": 1.0716215372085571, + "learning_rate": 3.992261674314411e-06, + "loss": 0.6769, + "step": 4509 + }, + { + "epoch": 0.58, + "grad_norm": 1.1231220960617065, + "learning_rate": 3.990229315241028e-06, + "loss": 0.5404, + "step": 4510 + }, + { + "epoch": 0.58, + "grad_norm": 1.6681180000305176, + "learning_rate": 3.988197130080121e-06, + "loss": 0.5339, + "step": 4511 + }, + { + "epoch": 0.58, + "grad_norm": 1.169209361076355, + "learning_rate": 3.986165119181692e-06, + "loss": 0.5946, + "step": 4512 + }, + { + "epoch": 0.58, + "grad_norm": 1.410796046257019, + "learning_rate": 3.9841332828957135e-06, + "loss": 0.6796, + "step": 4513 + }, + { + "epoch": 0.58, + "grad_norm": 1.1149036884307861, + "learning_rate": 3.98210162157213e-06, + "loss": 0.5849, + "step": 4514 + }, + { + "epoch": 0.58, + "grad_norm": 1.4105883836746216, + "learning_rate": 3.980070135560852e-06, + "loss": 0.5759, + "step": 4515 + }, + { + "epoch": 0.58, + "grad_norm": 1.4017486572265625, + "learning_rate": 3.978038825211763e-06, + "loss": 0.5776, + "step": 4516 + }, + { + "epoch": 0.58, + "grad_norm": 1.292477011680603, + "learning_rate": 3.9760076908747134e-06, + "loss": 0.7173, + "step": 4517 + }, + { + "epoch": 0.58, + "grad_norm": 1.4200750589370728, + "learning_rate": 3.973976732899526e-06, + "loss": 0.5984, + "step": 4518 + }, + { + "epoch": 0.58, + "grad_norm": 1.526354432106018, + "learning_rate": 3.971945951635992e-06, + "loss": 0.5828, + "step": 4519 + }, + { + "epoch": 0.58, + "grad_norm": 0.998432457447052, + "learning_rate": 3.969915347433871e-06, + "loss": 0.5829, + "step": 4520 + }, + { + "epoch": 0.58, + "grad_norm": 4.682458400726318, + "learning_rate": 3.967884920642895e-06, + "loss": 0.6694, + "step": 4521 + }, + { + "epoch": 0.58, + "grad_norm": 1.5824471712112427, + "learning_rate": 3.965854671612762e-06, + "loss": 0.5997, + "step": 4522 + }, + { + "epoch": 0.58, + "grad_norm": 6.456887245178223, + "learning_rate": 3.963824600693143e-06, + "loss": 0.6215, + "step": 4523 + }, + { + "epoch": 0.58, + "grad_norm": 1.255520224571228, + "learning_rate": 3.961794708233675e-06, + "loss": 0.7175, + "step": 4524 + }, + { + "epoch": 0.58, + "grad_norm": 1.1447798013687134, + "learning_rate": 3.959764994583965e-06, + "loss": 0.5813, + "step": 4525 + }, + { + "epoch": 0.58, + "grad_norm": 1.238195538520813, + "learning_rate": 3.957735460093591e-06, + "loss": 0.5689, + "step": 4526 + }, + { + "epoch": 0.58, + "grad_norm": 1.5053225755691528, + "learning_rate": 3.955706105112101e-06, + "loss": 0.591, + "step": 4527 + }, + { + "epoch": 0.58, + "grad_norm": 1.2886848449707031, + "learning_rate": 3.953676929989008e-06, + "loss": 0.5764, + "step": 4528 + }, + { + "epoch": 0.58, + "grad_norm": 1.453669786453247, + "learning_rate": 3.951647935073796e-06, + "loss": 0.6344, + "step": 4529 + }, + { + "epoch": 0.58, + "grad_norm": 1.3581910133361816, + "learning_rate": 3.949619120715918e-06, + "loss": 0.5262, + "step": 4530 + }, + { + "epoch": 0.58, + "grad_norm": 1.4336278438568115, + "learning_rate": 3.947590487264799e-06, + "loss": 0.5272, + "step": 4531 + }, + { + "epoch": 0.58, + "grad_norm": 1.4319508075714111, + "learning_rate": 3.945562035069826e-06, + "loss": 0.6207, + "step": 4532 + }, + { + "epoch": 0.58, + "grad_norm": 1.5566030740737915, + "learning_rate": 3.9435337644803605e-06, + "loss": 0.6633, + "step": 4533 + }, + { + "epoch": 0.58, + "grad_norm": 1.4970437288284302, + "learning_rate": 3.94150567584573e-06, + "loss": 0.6113, + "step": 4534 + }, + { + "epoch": 0.58, + "grad_norm": 1.2610963582992554, + "learning_rate": 3.9394777695152335e-06, + "loss": 0.6422, + "step": 4535 + }, + { + "epoch": 0.58, + "grad_norm": 1.2423081398010254, + "learning_rate": 3.937450045838133e-06, + "loss": 0.6177, + "step": 4536 + }, + { + "epoch": 0.58, + "grad_norm": 1.597859501838684, + "learning_rate": 3.935422505163667e-06, + "loss": 0.6175, + "step": 4537 + }, + { + "epoch": 0.58, + "grad_norm": 1.2870022058486938, + "learning_rate": 3.933395147841035e-06, + "loss": 0.5798, + "step": 4538 + }, + { + "epoch": 0.58, + "grad_norm": 1.2589836120605469, + "learning_rate": 3.931367974219411e-06, + "loss": 0.5974, + "step": 4539 + }, + { + "epoch": 0.58, + "grad_norm": 1.2614821195602417, + "learning_rate": 3.9293409846479305e-06, + "loss": 0.6749, + "step": 4540 + }, + { + "epoch": 0.58, + "grad_norm": 1.2686032056808472, + "learning_rate": 3.927314179475705e-06, + "loss": 0.5984, + "step": 4541 + }, + { + "epoch": 0.58, + "grad_norm": 1.2367477416992188, + "learning_rate": 3.925287559051808e-06, + "loss": 0.6273, + "step": 4542 + }, + { + "epoch": 0.58, + "grad_norm": 1.1769745349884033, + "learning_rate": 3.923261123725283e-06, + "loss": 0.5428, + "step": 4543 + }, + { + "epoch": 0.58, + "grad_norm": 1.3887510299682617, + "learning_rate": 3.921234873845146e-06, + "loss": 0.5769, + "step": 4544 + }, + { + "epoch": 0.58, + "grad_norm": 1.007226586341858, + "learning_rate": 3.9192088097603745e-06, + "loss": 0.6352, + "step": 4545 + }, + { + "epoch": 0.58, + "grad_norm": 1.2268894910812378, + "learning_rate": 3.917182931819918e-06, + "loss": 0.6749, + "step": 4546 + }, + { + "epoch": 0.58, + "grad_norm": 1.3286808729171753, + "learning_rate": 3.915157240372693e-06, + "loss": 0.5088, + "step": 4547 + }, + { + "epoch": 0.58, + "grad_norm": 1.3561018705368042, + "learning_rate": 3.913131735767583e-06, + "loss": 0.5868, + "step": 4548 + }, + { + "epoch": 0.58, + "grad_norm": 1.3705759048461914, + "learning_rate": 3.911106418353439e-06, + "loss": 0.6115, + "step": 4549 + }, + { + "epoch": 0.58, + "grad_norm": 1.358817458152771, + "learning_rate": 3.909081288479083e-06, + "loss": 0.5725, + "step": 4550 + }, + { + "epoch": 0.58, + "grad_norm": 1.503664493560791, + "learning_rate": 3.907056346493301e-06, + "loss": 0.6032, + "step": 4551 + }, + { + "epoch": 0.58, + "grad_norm": 1.1593071222305298, + "learning_rate": 3.905031592744849e-06, + "loss": 0.629, + "step": 4552 + }, + { + "epoch": 0.58, + "grad_norm": 0.9405057430267334, + "learning_rate": 3.9030070275824486e-06, + "loss": 0.5977, + "step": 4553 + }, + { + "epoch": 0.58, + "grad_norm": 1.4616215229034424, + "learning_rate": 3.90098265135479e-06, + "loss": 0.6383, + "step": 4554 + }, + { + "epoch": 0.58, + "grad_norm": 1.344404697418213, + "learning_rate": 3.898958464410532e-06, + "loss": 0.6543, + "step": 4555 + }, + { + "epoch": 0.58, + "grad_norm": 1.2103294134140015, + "learning_rate": 3.896934467098298e-06, + "loss": 0.598, + "step": 4556 + }, + { + "epoch": 0.58, + "grad_norm": 1.1870449781417847, + "learning_rate": 3.894910659766682e-06, + "loss": 0.6187, + "step": 4557 + }, + { + "epoch": 0.58, + "grad_norm": 2.1117770671844482, + "learning_rate": 3.892887042764243e-06, + "loss": 0.6356, + "step": 4558 + }, + { + "epoch": 0.58, + "grad_norm": 1.8922291994094849, + "learning_rate": 3.890863616439509e-06, + "loss": 0.5096, + "step": 4559 + }, + { + "epoch": 0.58, + "grad_norm": 1.3100972175598145, + "learning_rate": 3.888840381140971e-06, + "loss": 0.6282, + "step": 4560 + }, + { + "epoch": 0.58, + "grad_norm": 1.8961695432662964, + "learning_rate": 3.886817337217092e-06, + "loss": 0.5745, + "step": 4561 + }, + { + "epoch": 0.58, + "grad_norm": 1.7968751192092896, + "learning_rate": 3.884794485016302e-06, + "loss": 0.4665, + "step": 4562 + }, + { + "epoch": 0.58, + "grad_norm": 1.0768951177597046, + "learning_rate": 3.882771824886994e-06, + "loss": 0.583, + "step": 4563 + }, + { + "epoch": 0.58, + "grad_norm": 1.303018569946289, + "learning_rate": 3.8807493571775315e-06, + "loss": 0.5878, + "step": 4564 + }, + { + "epoch": 0.58, + "grad_norm": 1.1549627780914307, + "learning_rate": 3.878727082236241e-06, + "loss": 0.6445, + "step": 4565 + }, + { + "epoch": 0.58, + "grad_norm": 1.1280014514923096, + "learning_rate": 3.876705000411422e-06, + "loss": 0.627, + "step": 4566 + }, + { + "epoch": 0.59, + "grad_norm": 1.0545580387115479, + "learning_rate": 3.874683112051333e-06, + "loss": 0.6427, + "step": 4567 + }, + { + "epoch": 0.59, + "grad_norm": 1.2090257406234741, + "learning_rate": 3.872661417504207e-06, + "loss": 0.7049, + "step": 4568 + }, + { + "epoch": 0.59, + "grad_norm": 1.2235411405563354, + "learning_rate": 3.870639917118235e-06, + "loss": 0.7095, + "step": 4569 + }, + { + "epoch": 0.59, + "grad_norm": 1.0587208271026611, + "learning_rate": 3.868618611241584e-06, + "loss": 0.5291, + "step": 4570 + }, + { + "epoch": 0.59, + "grad_norm": 1.2914259433746338, + "learning_rate": 3.866597500222381e-06, + "loss": 0.5862, + "step": 4571 + }, + { + "epoch": 0.59, + "grad_norm": 1.3580983877182007, + "learning_rate": 3.864576584408722e-06, + "loss": 0.515, + "step": 4572 + }, + { + "epoch": 0.59, + "grad_norm": 1.3612672090530396, + "learning_rate": 3.862555864148666e-06, + "loss": 0.6386, + "step": 4573 + }, + { + "epoch": 0.59, + "grad_norm": 1.3382236957550049, + "learning_rate": 3.860535339790245e-06, + "loss": 0.5627, + "step": 4574 + }, + { + "epoch": 0.59, + "grad_norm": 1.3669838905334473, + "learning_rate": 3.85851501168145e-06, + "loss": 0.6867, + "step": 4575 + }, + { + "epoch": 0.59, + "grad_norm": 1.110496997833252, + "learning_rate": 3.856494880170243e-06, + "loss": 0.5651, + "step": 4576 + }, + { + "epoch": 0.59, + "grad_norm": 1.1593047380447388, + "learning_rate": 3.854474945604549e-06, + "loss": 0.4934, + "step": 4577 + }, + { + "epoch": 0.59, + "grad_norm": 1.3233683109283447, + "learning_rate": 3.852455208332262e-06, + "loss": 0.6082, + "step": 4578 + }, + { + "epoch": 0.59, + "grad_norm": 1.193565011024475, + "learning_rate": 3.850435668701243e-06, + "loss": 0.531, + "step": 4579 + }, + { + "epoch": 0.59, + "grad_norm": 1.1400476694107056, + "learning_rate": 3.8484163270593125e-06, + "loss": 0.5821, + "step": 4580 + }, + { + "epoch": 0.59, + "grad_norm": 1.239013910293579, + "learning_rate": 3.846397183754265e-06, + "loss": 0.5321, + "step": 4581 + }, + { + "epoch": 0.59, + "grad_norm": 1.63629150390625, + "learning_rate": 3.844378239133852e-06, + "loss": 0.5889, + "step": 4582 + }, + { + "epoch": 0.59, + "grad_norm": 1.2488521337509155, + "learning_rate": 3.8423594935458e-06, + "loss": 0.6494, + "step": 4583 + }, + { + "epoch": 0.59, + "grad_norm": 1.3007025718688965, + "learning_rate": 3.840340947337795e-06, + "loss": 0.6275, + "step": 4584 + }, + { + "epoch": 0.59, + "grad_norm": 1.5868581533432007, + "learning_rate": 3.838322600857491e-06, + "loss": 0.6701, + "step": 4585 + }, + { + "epoch": 0.59, + "grad_norm": 0.9399994015693665, + "learning_rate": 3.8363044544525065e-06, + "loss": 0.5922, + "step": 4586 + }, + { + "epoch": 0.59, + "grad_norm": 1.3345903158187866, + "learning_rate": 3.834286508470428e-06, + "loss": 0.6308, + "step": 4587 + }, + { + "epoch": 0.59, + "grad_norm": 1.9581162929534912, + "learning_rate": 3.832268763258803e-06, + "loss": 0.6132, + "step": 4588 + }, + { + "epoch": 0.59, + "grad_norm": 1.2127094268798828, + "learning_rate": 3.83025121916515e-06, + "loss": 0.6929, + "step": 4589 + }, + { + "epoch": 0.59, + "grad_norm": 1.5069866180419922, + "learning_rate": 3.8282338765369466e-06, + "loss": 0.5396, + "step": 4590 + }, + { + "epoch": 0.59, + "grad_norm": 1.1287541389465332, + "learning_rate": 3.8262167357216426e-06, + "loss": 0.6039, + "step": 4591 + }, + { + "epoch": 0.59, + "grad_norm": 1.0839670896530151, + "learning_rate": 3.824199797066646e-06, + "loss": 0.6734, + "step": 4592 + }, + { + "epoch": 0.59, + "grad_norm": 1.1110575199127197, + "learning_rate": 3.822183060919337e-06, + "loss": 0.6552, + "step": 4593 + }, + { + "epoch": 0.59, + "grad_norm": 1.0983967781066895, + "learning_rate": 3.820166527627054e-06, + "loss": 0.5722, + "step": 4594 + }, + { + "epoch": 0.59, + "grad_norm": 1.3963110446929932, + "learning_rate": 3.818150197537106e-06, + "loss": 0.6369, + "step": 4595 + }, + { + "epoch": 0.59, + "grad_norm": 1.2948497533798218, + "learning_rate": 3.816134070996766e-06, + "loss": 0.5612, + "step": 4596 + }, + { + "epoch": 0.59, + "grad_norm": 1.1768401861190796, + "learning_rate": 3.8141181483532676e-06, + "loss": 0.586, + "step": 4597 + }, + { + "epoch": 0.59, + "grad_norm": 1.5991896390914917, + "learning_rate": 3.8121024299538156e-06, + "loss": 0.6508, + "step": 4598 + }, + { + "epoch": 0.59, + "grad_norm": 1.1931565999984741, + "learning_rate": 3.8100869161455746e-06, + "loss": 0.5943, + "step": 4599 + }, + { + "epoch": 0.59, + "grad_norm": 1.2148854732513428, + "learning_rate": 3.8080716072756767e-06, + "loss": 0.648, + "step": 4600 + }, + { + "epoch": 0.59, + "grad_norm": 1.4304548501968384, + "learning_rate": 3.8060565036912167e-06, + "loss": 0.674, + "step": 4601 + }, + { + "epoch": 0.59, + "grad_norm": 1.529685616493225, + "learning_rate": 3.8040416057392577e-06, + "loss": 0.5341, + "step": 4602 + }, + { + "epoch": 0.59, + "grad_norm": 1.5385857820510864, + "learning_rate": 3.802026913766823e-06, + "loss": 0.678, + "step": 4603 + }, + { + "epoch": 0.59, + "grad_norm": 1.180269479751587, + "learning_rate": 3.8000124281209015e-06, + "loss": 0.5744, + "step": 4604 + }, + { + "epoch": 0.59, + "grad_norm": 1.1694934368133545, + "learning_rate": 3.7979981491484496e-06, + "loss": 0.5707, + "step": 4605 + }, + { + "epoch": 0.59, + "grad_norm": 1.2115732431411743, + "learning_rate": 3.795984077196384e-06, + "loss": 0.5016, + "step": 4606 + }, + { + "epoch": 0.59, + "grad_norm": 1.5277433395385742, + "learning_rate": 3.7939702126115895e-06, + "loss": 0.6791, + "step": 4607 + }, + { + "epoch": 0.59, + "grad_norm": 1.4278790950775146, + "learning_rate": 3.7919565557409115e-06, + "loss": 0.6131, + "step": 4608 + }, + { + "epoch": 0.59, + "grad_norm": 1.4480127096176147, + "learning_rate": 3.789943106931164e-06, + "loss": 0.532, + "step": 4609 + }, + { + "epoch": 0.59, + "grad_norm": 1.3539552688598633, + "learning_rate": 3.7879298665291194e-06, + "loss": 0.5845, + "step": 4610 + }, + { + "epoch": 0.59, + "grad_norm": 2.717895030975342, + "learning_rate": 3.7859168348815177e-06, + "loss": 0.5879, + "step": 4611 + }, + { + "epoch": 0.59, + "grad_norm": 1.146493911743164, + "learning_rate": 3.7839040123350664e-06, + "loss": 0.7074, + "step": 4612 + }, + { + "epoch": 0.59, + "grad_norm": 1.0468714237213135, + "learning_rate": 3.7818913992364298e-06, + "loss": 0.4987, + "step": 4613 + }, + { + "epoch": 0.59, + "grad_norm": 1.3573884963989258, + "learning_rate": 3.7798789959322417e-06, + "loss": 0.7328, + "step": 4614 + }, + { + "epoch": 0.59, + "grad_norm": 4.123680591583252, + "learning_rate": 3.7778668027690957e-06, + "loss": 0.603, + "step": 4615 + }, + { + "epoch": 0.59, + "grad_norm": 1.4465726613998413, + "learning_rate": 3.7758548200935537e-06, + "loss": 0.5605, + "step": 4616 + }, + { + "epoch": 0.59, + "grad_norm": 1.3835780620574951, + "learning_rate": 3.7738430482521355e-06, + "loss": 0.6137, + "step": 4617 + }, + { + "epoch": 0.59, + "grad_norm": 1.2412621974945068, + "learning_rate": 3.771831487591331e-06, + "loss": 0.5755, + "step": 4618 + }, + { + "epoch": 0.59, + "grad_norm": 1.3044785261154175, + "learning_rate": 3.7698201384575883e-06, + "loss": 0.6081, + "step": 4619 + }, + { + "epoch": 0.59, + "grad_norm": 1.197967290878296, + "learning_rate": 3.767809001197323e-06, + "loss": 0.6496, + "step": 4620 + }, + { + "epoch": 0.59, + "grad_norm": 1.3317675590515137, + "learning_rate": 3.7657980761569114e-06, + "loss": 0.6448, + "step": 4621 + }, + { + "epoch": 0.59, + "grad_norm": 1.4491106271743774, + "learning_rate": 3.763787363682696e-06, + "loss": 0.7453, + "step": 4622 + }, + { + "epoch": 0.59, + "grad_norm": 1.172399640083313, + "learning_rate": 3.7617768641209797e-06, + "loss": 0.5975, + "step": 4623 + }, + { + "epoch": 0.59, + "grad_norm": 1.3938887119293213, + "learning_rate": 3.7597665778180307e-06, + "loss": 0.6943, + "step": 4624 + }, + { + "epoch": 0.59, + "grad_norm": 1.1383371353149414, + "learning_rate": 3.75775650512008e-06, + "loss": 0.5426, + "step": 4625 + }, + { + "epoch": 0.59, + "grad_norm": 1.595054268836975, + "learning_rate": 3.755746646373322e-06, + "loss": 0.6339, + "step": 4626 + }, + { + "epoch": 0.59, + "grad_norm": 1.3873826265335083, + "learning_rate": 3.7537370019239135e-06, + "loss": 0.597, + "step": 4627 + }, + { + "epoch": 0.59, + "grad_norm": 1.1629124879837036, + "learning_rate": 3.7517275721179736e-06, + "loss": 0.6172, + "step": 4628 + }, + { + "epoch": 0.59, + "grad_norm": 1.3999043703079224, + "learning_rate": 3.7497183573015893e-06, + "loss": 0.5355, + "step": 4629 + }, + { + "epoch": 0.59, + "grad_norm": 1.1866925954818726, + "learning_rate": 3.7477093578208047e-06, + "loss": 0.5889, + "step": 4630 + }, + { + "epoch": 0.59, + "grad_norm": 1.3769731521606445, + "learning_rate": 3.74570057402163e-06, + "loss": 0.5985, + "step": 4631 + }, + { + "epoch": 0.59, + "grad_norm": 1.4890729188919067, + "learning_rate": 3.743692006250036e-06, + "loss": 0.5906, + "step": 4632 + }, + { + "epoch": 0.59, + "grad_norm": 1.5705612897872925, + "learning_rate": 3.741683654851959e-06, + "loss": 0.6187, + "step": 4633 + }, + { + "epoch": 0.59, + "grad_norm": 1.6368554830551147, + "learning_rate": 3.739675520173296e-06, + "loss": 0.6418, + "step": 4634 + }, + { + "epoch": 0.59, + "grad_norm": 1.1601102352142334, + "learning_rate": 3.737667602559908e-06, + "loss": 0.5984, + "step": 4635 + }, + { + "epoch": 0.59, + "grad_norm": 2.6461944580078125, + "learning_rate": 3.7356599023576166e-06, + "loss": 0.5892, + "step": 4636 + }, + { + "epoch": 0.59, + "grad_norm": 1.2127076387405396, + "learning_rate": 3.7336524199122094e-06, + "loss": 0.5839, + "step": 4637 + }, + { + "epoch": 0.59, + "grad_norm": 1.202880859375, + "learning_rate": 3.7316451555694327e-06, + "loss": 0.6829, + "step": 4638 + }, + { + "epoch": 0.59, + "grad_norm": 1.4662011861801147, + "learning_rate": 3.7296381096749983e-06, + "loss": 0.5418, + "step": 4639 + }, + { + "epoch": 0.59, + "grad_norm": 1.1767808198928833, + "learning_rate": 3.7276312825745775e-06, + "loss": 0.5559, + "step": 4640 + }, + { + "epoch": 0.59, + "grad_norm": 1.2574052810668945, + "learning_rate": 3.7256246746138082e-06, + "loss": 0.6815, + "step": 4641 + }, + { + "epoch": 0.59, + "grad_norm": 1.4866012334823608, + "learning_rate": 3.7236182861382843e-06, + "loss": 0.6487, + "step": 4642 + }, + { + "epoch": 0.59, + "grad_norm": 1.2232147455215454, + "learning_rate": 3.721612117493568e-06, + "loss": 0.6718, + "step": 4643 + }, + { + "epoch": 0.59, + "grad_norm": 1.3642147779464722, + "learning_rate": 3.7196061690251795e-06, + "loss": 0.5534, + "step": 4644 + }, + { + "epoch": 0.6, + "grad_norm": 1.121066927909851, + "learning_rate": 3.7176004410786047e-06, + "loss": 0.5487, + "step": 4645 + }, + { + "epoch": 0.6, + "grad_norm": 1.6986517906188965, + "learning_rate": 3.7155949339992856e-06, + "loss": 0.7287, + "step": 4646 + }, + { + "epoch": 0.6, + "grad_norm": 1.4121288061141968, + "learning_rate": 3.713589648132634e-06, + "loss": 0.5755, + "step": 4647 + }, + { + "epoch": 0.6, + "grad_norm": 1.3280876874923706, + "learning_rate": 3.7115845838240193e-06, + "loss": 0.5657, + "step": 4648 + }, + { + "epoch": 0.6, + "grad_norm": 1.3368446826934814, + "learning_rate": 3.7095797414187707e-06, + "loss": 0.6326, + "step": 4649 + }, + { + "epoch": 0.6, + "grad_norm": 1.0597262382507324, + "learning_rate": 3.707575121262185e-06, + "loss": 0.5701, + "step": 4650 + }, + { + "epoch": 0.6, + "grad_norm": 1.1828985214233398, + "learning_rate": 3.7055707236995123e-06, + "loss": 0.5832, + "step": 4651 + }, + { + "epoch": 0.6, + "grad_norm": 1.4886174201965332, + "learning_rate": 3.7035665490759743e-06, + "loss": 0.6071, + "step": 4652 + }, + { + "epoch": 0.6, + "grad_norm": 1.7236698865890503, + "learning_rate": 3.7015625977367476e-06, + "loss": 0.6029, + "step": 4653 + }, + { + "epoch": 0.6, + "grad_norm": 1.32156240940094, + "learning_rate": 3.6995588700269697e-06, + "loss": 0.5823, + "step": 4654 + }, + { + "epoch": 0.6, + "grad_norm": 1.207321047782898, + "learning_rate": 3.6975553662917453e-06, + "loss": 0.5992, + "step": 4655 + }, + { + "epoch": 0.6, + "grad_norm": 1.2201660871505737, + "learning_rate": 3.695552086876135e-06, + "loss": 0.4492, + "step": 4656 + }, + { + "epoch": 0.6, + "grad_norm": 1.4512239694595337, + "learning_rate": 3.6935490321251655e-06, + "loss": 0.6295, + "step": 4657 + }, + { + "epoch": 0.6, + "grad_norm": 1.6261647939682007, + "learning_rate": 3.691546202383819e-06, + "loss": 0.5875, + "step": 4658 + }, + { + "epoch": 0.6, + "grad_norm": 1.0876991748809814, + "learning_rate": 3.689543597997044e-06, + "loss": 0.5359, + "step": 4659 + }, + { + "epoch": 0.6, + "grad_norm": 1.2004252672195435, + "learning_rate": 3.687541219309748e-06, + "loss": 0.5935, + "step": 4660 + }, + { + "epoch": 0.6, + "grad_norm": 1.3604297637939453, + "learning_rate": 3.685539066666802e-06, + "loss": 0.5974, + "step": 4661 + }, + { + "epoch": 0.6, + "grad_norm": 1.2416634559631348, + "learning_rate": 3.683537140413032e-06, + "loss": 0.6314, + "step": 4662 + }, + { + "epoch": 0.6, + "grad_norm": 1.1395695209503174, + "learning_rate": 3.6815354408932314e-06, + "loss": 0.6032, + "step": 4663 + }, + { + "epoch": 0.6, + "grad_norm": 1.3960171937942505, + "learning_rate": 3.6795339684521535e-06, + "loss": 0.6214, + "step": 4664 + }, + { + "epoch": 0.6, + "grad_norm": 1.3174827098846436, + "learning_rate": 3.67753272343451e-06, + "loss": 0.6025, + "step": 4665 + }, + { + "epoch": 0.6, + "grad_norm": 1.3568823337554932, + "learning_rate": 3.675531706184975e-06, + "loss": 0.6243, + "step": 4666 + }, + { + "epoch": 0.6, + "grad_norm": 1.2531613111495972, + "learning_rate": 3.6735309170481825e-06, + "loss": 0.588, + "step": 4667 + }, + { + "epoch": 0.6, + "grad_norm": 1.1361048221588135, + "learning_rate": 3.6715303563687286e-06, + "loss": 0.6366, + "step": 4668 + }, + { + "epoch": 0.6, + "grad_norm": 1.0308409929275513, + "learning_rate": 3.6695300244911676e-06, + "loss": 0.485, + "step": 4669 + }, + { + "epoch": 0.6, + "grad_norm": 1.2207977771759033, + "learning_rate": 3.667529921760018e-06, + "loss": 0.6058, + "step": 4670 + }, + { + "epoch": 0.6, + "grad_norm": 1.139939546585083, + "learning_rate": 3.6655300485197556e-06, + "loss": 0.5526, + "step": 4671 + }, + { + "epoch": 0.6, + "grad_norm": 1.3506957292556763, + "learning_rate": 3.663530405114818e-06, + "loss": 0.6159, + "step": 4672 + }, + { + "epoch": 0.6, + "grad_norm": 1.240322232246399, + "learning_rate": 3.6615309918896034e-06, + "loss": 0.5767, + "step": 4673 + }, + { + "epoch": 0.6, + "grad_norm": 3.595827579498291, + "learning_rate": 3.6595318091884707e-06, + "loss": 0.6388, + "step": 4674 + }, + { + "epoch": 0.6, + "grad_norm": 1.5169620513916016, + "learning_rate": 3.6575328573557367e-06, + "loss": 0.6659, + "step": 4675 + }, + { + "epoch": 0.6, + "grad_norm": 1.1293915510177612, + "learning_rate": 3.655534136735682e-06, + "loss": 0.5856, + "step": 4676 + }, + { + "epoch": 0.6, + "grad_norm": 1.3506858348846436, + "learning_rate": 3.6535356476725447e-06, + "loss": 0.6416, + "step": 4677 + }, + { + "epoch": 0.6, + "grad_norm": 1.2166670560836792, + "learning_rate": 3.6515373905105254e-06, + "loss": 0.6027, + "step": 4678 + }, + { + "epoch": 0.6, + "grad_norm": 2.4438283443450928, + "learning_rate": 3.6495393655937806e-06, + "loss": 0.6248, + "step": 4679 + }, + { + "epoch": 0.6, + "grad_norm": 1.3515195846557617, + "learning_rate": 3.6475415732664297e-06, + "loss": 0.5362, + "step": 4680 + }, + { + "epoch": 0.6, + "grad_norm": 1.5036596059799194, + "learning_rate": 3.6455440138725553e-06, + "loss": 0.6032, + "step": 4681 + }, + { + "epoch": 0.6, + "grad_norm": 1.2506636381149292, + "learning_rate": 3.6435466877561933e-06, + "loss": 0.6498, + "step": 4682 + }, + { + "epoch": 0.6, + "grad_norm": 1.2424179315567017, + "learning_rate": 3.6415495952613446e-06, + "loss": 0.6529, + "step": 4683 + }, + { + "epoch": 0.6, + "grad_norm": 1.3675919771194458, + "learning_rate": 3.639552736731965e-06, + "loss": 0.5896, + "step": 4684 + }, + { + "epoch": 0.6, + "grad_norm": 1.2366334199905396, + "learning_rate": 3.6375561125119752e-06, + "loss": 0.7092, + "step": 4685 + }, + { + "epoch": 0.6, + "grad_norm": 1.2184380292892456, + "learning_rate": 3.635559722945252e-06, + "loss": 0.6131, + "step": 4686 + }, + { + "epoch": 0.6, + "grad_norm": 1.219530701637268, + "learning_rate": 3.6335635683756343e-06, + "loss": 0.5639, + "step": 4687 + }, + { + "epoch": 0.6, + "grad_norm": 1.363353967666626, + "learning_rate": 3.6315676491469165e-06, + "loss": 0.5838, + "step": 4688 + }, + { + "epoch": 0.6, + "grad_norm": 1.2268239259719849, + "learning_rate": 3.629571965602858e-06, + "loss": 0.5609, + "step": 4689 + }, + { + "epoch": 0.6, + "grad_norm": 1.1857258081436157, + "learning_rate": 3.6275765180871723e-06, + "loss": 0.6544, + "step": 4690 + }, + { + "epoch": 0.6, + "grad_norm": 1.2621301412582397, + "learning_rate": 3.625581306943537e-06, + "loss": 0.5046, + "step": 4691 + }, + { + "epoch": 0.6, + "grad_norm": 1.828747034072876, + "learning_rate": 3.623586332515584e-06, + "loss": 0.6152, + "step": 4692 + }, + { + "epoch": 0.6, + "grad_norm": 1.2840453386306763, + "learning_rate": 3.6215915951469105e-06, + "loss": 0.7242, + "step": 4693 + }, + { + "epoch": 0.6, + "grad_norm": 2.8199918270111084, + "learning_rate": 3.6195970951810653e-06, + "loss": 0.5506, + "step": 4694 + }, + { + "epoch": 0.6, + "grad_norm": 1.6158698797225952, + "learning_rate": 3.6176028329615654e-06, + "loss": 0.6238, + "step": 4695 + }, + { + "epoch": 0.6, + "grad_norm": 1.960234522819519, + "learning_rate": 3.615608808831877e-06, + "loss": 0.6093, + "step": 4696 + }, + { + "epoch": 0.6, + "grad_norm": 1.1626479625701904, + "learning_rate": 3.6136150231354317e-06, + "loss": 0.6119, + "step": 4697 + }, + { + "epoch": 0.6, + "grad_norm": 1.4083420038223267, + "learning_rate": 3.611621476215621e-06, + "loss": 0.5713, + "step": 4698 + }, + { + "epoch": 0.6, + "grad_norm": 1.3749487400054932, + "learning_rate": 3.609628168415791e-06, + "loss": 0.6795, + "step": 4699 + }, + { + "epoch": 0.6, + "grad_norm": 1.4019120931625366, + "learning_rate": 3.6076351000792487e-06, + "loss": 0.643, + "step": 4700 + }, + { + "epoch": 0.6, + "grad_norm": 1.2229111194610596, + "learning_rate": 3.60564227154926e-06, + "loss": 0.6225, + "step": 4701 + }, + { + "epoch": 0.6, + "grad_norm": 1.121282696723938, + "learning_rate": 3.6036496831690483e-06, + "loss": 0.5681, + "step": 4702 + }, + { + "epoch": 0.6, + "grad_norm": 1.2437260150909424, + "learning_rate": 3.601657335281797e-06, + "loss": 0.6019, + "step": 4703 + }, + { + "epoch": 0.6, + "grad_norm": 1.2536563873291016, + "learning_rate": 3.5996652282306467e-06, + "loss": 0.6497, + "step": 4704 + }, + { + "epoch": 0.6, + "grad_norm": 1.2499635219573975, + "learning_rate": 3.5976733623586986e-06, + "loss": 0.574, + "step": 4705 + }, + { + "epoch": 0.6, + "grad_norm": 1.0934405326843262, + "learning_rate": 3.5956817380090092e-06, + "loss": 0.617, + "step": 4706 + }, + { + "epoch": 0.6, + "grad_norm": 1.4827922582626343, + "learning_rate": 3.5936903555245983e-06, + "loss": 0.6322, + "step": 4707 + }, + { + "epoch": 0.6, + "grad_norm": 1.30734121799469, + "learning_rate": 3.5916992152484382e-06, + "loss": 0.6527, + "step": 4708 + }, + { + "epoch": 0.6, + "grad_norm": 1.2423964738845825, + "learning_rate": 3.5897083175234644e-06, + "loss": 0.632, + "step": 4709 + }, + { + "epoch": 0.6, + "grad_norm": 1.3838889598846436, + "learning_rate": 3.587717662692567e-06, + "loss": 0.6033, + "step": 4710 + }, + { + "epoch": 0.6, + "grad_norm": 1.2591181993484497, + "learning_rate": 3.5857272510985964e-06, + "loss": 0.6277, + "step": 4711 + }, + { + "epoch": 0.6, + "grad_norm": 1.1490346193313599, + "learning_rate": 3.583737083084361e-06, + "loss": 0.5655, + "step": 4712 + }, + { + "epoch": 0.6, + "grad_norm": 1.445289134979248, + "learning_rate": 3.5817471589926266e-06, + "loss": 0.6682, + "step": 4713 + }, + { + "epoch": 0.6, + "grad_norm": 1.6664557456970215, + "learning_rate": 3.579757479166115e-06, + "loss": 0.5634, + "step": 4714 + }, + { + "epoch": 0.6, + "grad_norm": 1.124857783317566, + "learning_rate": 3.5777680439475115e-06, + "loss": 0.7383, + "step": 4715 + }, + { + "epoch": 0.6, + "grad_norm": 1.228276014328003, + "learning_rate": 3.5757788536794547e-06, + "loss": 0.5617, + "step": 4716 + }, + { + "epoch": 0.6, + "grad_norm": 1.1751571893692017, + "learning_rate": 3.573789908704542e-06, + "loss": 0.5534, + "step": 4717 + }, + { + "epoch": 0.6, + "grad_norm": 1.3536722660064697, + "learning_rate": 3.5718012093653294e-06, + "loss": 0.568, + "step": 4718 + }, + { + "epoch": 0.6, + "grad_norm": 1.338544249534607, + "learning_rate": 3.5698127560043277e-06, + "loss": 0.554, + "step": 4719 + }, + { + "epoch": 0.6, + "grad_norm": 1.0579848289489746, + "learning_rate": 3.567824548964011e-06, + "loss": 0.4984, + "step": 4720 + }, + { + "epoch": 0.6, + "grad_norm": 1.7330485582351685, + "learning_rate": 3.565836588586804e-06, + "loss": 0.6076, + "step": 4721 + }, + { + "epoch": 0.6, + "grad_norm": 1.2623368501663208, + "learning_rate": 3.5638488752150956e-06, + "loss": 0.6481, + "step": 4722 + }, + { + "epoch": 0.61, + "grad_norm": 1.1102206707000732, + "learning_rate": 3.5618614091912264e-06, + "loss": 0.5571, + "step": 4723 + }, + { + "epoch": 0.61, + "grad_norm": 1.2276370525360107, + "learning_rate": 3.559874190857499e-06, + "loss": 0.7092, + "step": 4724 + }, + { + "epoch": 0.61, + "grad_norm": 1.190426230430603, + "learning_rate": 3.55788722055617e-06, + "loss": 0.6076, + "step": 4725 + }, + { + "epoch": 0.61, + "grad_norm": 1.1113755702972412, + "learning_rate": 3.555900498629456e-06, + "loss": 0.6253, + "step": 4726 + }, + { + "epoch": 0.61, + "grad_norm": 1.146645426750183, + "learning_rate": 3.5539140254195277e-06, + "loss": 0.5932, + "step": 4727 + }, + { + "epoch": 0.61, + "grad_norm": 1.3581435680389404, + "learning_rate": 3.5519278012685164e-06, + "loss": 0.6639, + "step": 4728 + }, + { + "epoch": 0.61, + "grad_norm": 1.19074285030365, + "learning_rate": 3.5499418265185082e-06, + "loss": 0.6644, + "step": 4729 + }, + { + "epoch": 0.61, + "grad_norm": 1.2494276762008667, + "learning_rate": 3.547956101511547e-06, + "loss": 0.5994, + "step": 4730 + }, + { + "epoch": 0.61, + "grad_norm": 1.3655177354812622, + "learning_rate": 3.545970626589631e-06, + "loss": 0.6564, + "step": 4731 + }, + { + "epoch": 0.61, + "grad_norm": 1.3489915132522583, + "learning_rate": 3.5439854020947216e-06, + "loss": 0.7015, + "step": 4732 + }, + { + "epoch": 0.61, + "grad_norm": 1.1399732828140259, + "learning_rate": 3.5420004283687338e-06, + "loss": 0.594, + "step": 4733 + }, + { + "epoch": 0.61, + "grad_norm": 1.2338383197784424, + "learning_rate": 3.5400157057535354e-06, + "loss": 0.6912, + "step": 4734 + }, + { + "epoch": 0.61, + "grad_norm": 1.1922136545181274, + "learning_rate": 3.5380312345909582e-06, + "loss": 0.5775, + "step": 4735 + }, + { + "epoch": 0.61, + "grad_norm": 1.2153799533843994, + "learning_rate": 3.5360470152227846e-06, + "loss": 0.5943, + "step": 4736 + }, + { + "epoch": 0.61, + "grad_norm": 1.3712053298950195, + "learning_rate": 3.5340630479907567e-06, + "loss": 0.6236, + "step": 4737 + }, + { + "epoch": 0.61, + "grad_norm": 1.0961687564849854, + "learning_rate": 3.532079333236571e-06, + "loss": 0.5619, + "step": 4738 + }, + { + "epoch": 0.61, + "grad_norm": 1.1054195165634155, + "learning_rate": 3.5300958713018858e-06, + "loss": 0.5613, + "step": 4739 + }, + { + "epoch": 0.61, + "grad_norm": 1.196414589881897, + "learning_rate": 3.528112662528308e-06, + "loss": 0.6143, + "step": 4740 + }, + { + "epoch": 0.61, + "grad_norm": 1.1598870754241943, + "learning_rate": 3.5261297072574085e-06, + "loss": 0.6768, + "step": 4741 + }, + { + "epoch": 0.61, + "grad_norm": 1.3299384117126465, + "learning_rate": 3.524147005830708e-06, + "loss": 0.6039, + "step": 4742 + }, + { + "epoch": 0.61, + "grad_norm": 1.8477388620376587, + "learning_rate": 3.522164558589689e-06, + "loss": 0.5782, + "step": 4743 + }, + { + "epoch": 0.61, + "grad_norm": 1.4437540769577026, + "learning_rate": 3.5201823658757855e-06, + "loss": 0.6431, + "step": 4744 + }, + { + "epoch": 0.61, + "grad_norm": 1.136198878288269, + "learning_rate": 3.5182004280303927e-06, + "loss": 0.5964, + "step": 4745 + }, + { + "epoch": 0.61, + "grad_norm": 1.2844723463058472, + "learning_rate": 3.516218745394857e-06, + "loss": 0.5537, + "step": 4746 + }, + { + "epoch": 0.61, + "grad_norm": 1.1594425439834595, + "learning_rate": 3.5142373183104823e-06, + "loss": 0.7255, + "step": 4747 + }, + { + "epoch": 0.61, + "grad_norm": 1.6344101428985596, + "learning_rate": 3.5122561471185292e-06, + "loss": 0.5981, + "step": 4748 + }, + { + "epoch": 0.61, + "grad_norm": 1.394809603691101, + "learning_rate": 3.5102752321602163e-06, + "loss": 0.5609, + "step": 4749 + }, + { + "epoch": 0.61, + "grad_norm": 2.0609498023986816, + "learning_rate": 3.508294573776716e-06, + "loss": 0.5798, + "step": 4750 + }, + { + "epoch": 0.61, + "grad_norm": 1.3028736114501953, + "learning_rate": 3.5063141723091555e-06, + "loss": 0.5482, + "step": 4751 + }, + { + "epoch": 0.61, + "grad_norm": 1.1946570873260498, + "learning_rate": 3.504334028098617e-06, + "loss": 0.5992, + "step": 4752 + }, + { + "epoch": 0.61, + "grad_norm": 1.6412652730941772, + "learning_rate": 3.502354141486143e-06, + "loss": 0.6044, + "step": 4753 + }, + { + "epoch": 0.61, + "grad_norm": 2.494856119155884, + "learning_rate": 3.5003745128127263e-06, + "loss": 0.5425, + "step": 4754 + }, + { + "epoch": 0.61, + "grad_norm": 1.1414670944213867, + "learning_rate": 3.4983951424193196e-06, + "loss": 0.756, + "step": 4755 + }, + { + "epoch": 0.61, + "grad_norm": 1.2341411113739014, + "learning_rate": 3.4964160306468274e-06, + "loss": 0.5684, + "step": 4756 + }, + { + "epoch": 0.61, + "grad_norm": 1.4616705179214478, + "learning_rate": 3.4944371778361143e-06, + "loss": 0.5613, + "step": 4757 + }, + { + "epoch": 0.61, + "grad_norm": 1.233243465423584, + "learning_rate": 3.4924585843279933e-06, + "loss": 0.6797, + "step": 4758 + }, + { + "epoch": 0.61, + "grad_norm": 1.328782081604004, + "learning_rate": 3.4904802504632406e-06, + "loss": 0.6308, + "step": 4759 + }, + { + "epoch": 0.61, + "grad_norm": 1.412166714668274, + "learning_rate": 3.488502176582582e-06, + "loss": 0.5519, + "step": 4760 + }, + { + "epoch": 0.61, + "grad_norm": 1.19275963306427, + "learning_rate": 3.4865243630267022e-06, + "loss": 0.5534, + "step": 4761 + }, + { + "epoch": 0.61, + "grad_norm": 1.3603435754776, + "learning_rate": 3.484546810136237e-06, + "loss": 0.71, + "step": 4762 + }, + { + "epoch": 0.61, + "grad_norm": 1.1178488731384277, + "learning_rate": 3.4825695182517823e-06, + "loss": 0.6962, + "step": 4763 + }, + { + "epoch": 0.61, + "grad_norm": 1.11331307888031, + "learning_rate": 3.4805924877138837e-06, + "loss": 0.6168, + "step": 4764 + }, + { + "epoch": 0.61, + "grad_norm": 1.4276987314224243, + "learning_rate": 3.478615718863047e-06, + "loss": 0.6107, + "step": 4765 + }, + { + "epoch": 0.61, + "grad_norm": 1.1094125509262085, + "learning_rate": 3.4766392120397274e-06, + "loss": 0.5935, + "step": 4766 + }, + { + "epoch": 0.61, + "grad_norm": 1.393486738204956, + "learning_rate": 3.4746629675843413e-06, + "loss": 0.6608, + "step": 4767 + }, + { + "epoch": 0.61, + "grad_norm": 1.3646267652511597, + "learning_rate": 3.472686985837257e-06, + "loss": 0.7118, + "step": 4768 + }, + { + "epoch": 0.61, + "grad_norm": 1.2422285079956055, + "learning_rate": 3.470711267138794e-06, + "loss": 0.5005, + "step": 4769 + }, + { + "epoch": 0.61, + "grad_norm": 1.4949270486831665, + "learning_rate": 3.4687358118292325e-06, + "loss": 0.5937, + "step": 4770 + }, + { + "epoch": 0.61, + "grad_norm": 1.2646280527114868, + "learning_rate": 3.4667606202488014e-06, + "loss": 0.5595, + "step": 4771 + }, + { + "epoch": 0.61, + "grad_norm": 1.8542555570602417, + "learning_rate": 3.464785692737691e-06, + "loss": 0.6481, + "step": 4772 + }, + { + "epoch": 0.61, + "grad_norm": 1.4289233684539795, + "learning_rate": 3.4628110296360394e-06, + "loss": 0.6218, + "step": 4773 + }, + { + "epoch": 0.61, + "grad_norm": 1.189062237739563, + "learning_rate": 3.460836631283945e-06, + "loss": 0.5669, + "step": 4774 + }, + { + "epoch": 0.61, + "grad_norm": 1.6110448837280273, + "learning_rate": 3.4588624980214547e-06, + "loss": 0.6313, + "step": 4775 + }, + { + "epoch": 0.61, + "grad_norm": 1.2245473861694336, + "learning_rate": 3.4568886301885753e-06, + "loss": 0.651, + "step": 4776 + }, + { + "epoch": 0.61, + "grad_norm": 1.3276503086090088, + "learning_rate": 3.4549150281252635e-06, + "loss": 0.6339, + "step": 4777 + }, + { + "epoch": 0.61, + "grad_norm": 1.3305314779281616, + "learning_rate": 3.4529416921714344e-06, + "loss": 0.6083, + "step": 4778 + }, + { + "epoch": 0.61, + "grad_norm": 1.6480473279953003, + "learning_rate": 3.450968622666952e-06, + "loss": 0.6153, + "step": 4779 + }, + { + "epoch": 0.61, + "grad_norm": 1.2591640949249268, + "learning_rate": 3.4489958199516404e-06, + "loss": 0.7308, + "step": 4780 + }, + { + "epoch": 0.61, + "grad_norm": 1.382493019104004, + "learning_rate": 3.4470232843652728e-06, + "loss": 0.6487, + "step": 4781 + }, + { + "epoch": 0.61, + "grad_norm": 1.3461976051330566, + "learning_rate": 3.4450510162475797e-06, + "loss": 0.664, + "step": 4782 + }, + { + "epoch": 0.61, + "grad_norm": 1.8919711112976074, + "learning_rate": 3.4430790159382414e-06, + "loss": 0.6032, + "step": 4783 + }, + { + "epoch": 0.61, + "grad_norm": 1.4023423194885254, + "learning_rate": 3.441107283776899e-06, + "loss": 0.6103, + "step": 4784 + }, + { + "epoch": 0.61, + "grad_norm": 1.211699366569519, + "learning_rate": 3.4391358201031412e-06, + "loss": 0.5951, + "step": 4785 + }, + { + "epoch": 0.61, + "grad_norm": 1.2939444780349731, + "learning_rate": 3.4371646252565114e-06, + "loss": 0.6142, + "step": 4786 + }, + { + "epoch": 0.61, + "grad_norm": 1.2708574533462524, + "learning_rate": 3.4351936995765112e-06, + "loss": 0.5485, + "step": 4787 + }, + { + "epoch": 0.61, + "grad_norm": 1.2561215162277222, + "learning_rate": 3.4332230434025887e-06, + "loss": 0.5625, + "step": 4788 + }, + { + "epoch": 0.61, + "grad_norm": 1.0576465129852295, + "learning_rate": 3.4312526570741524e-06, + "loss": 0.693, + "step": 4789 + }, + { + "epoch": 0.61, + "grad_norm": 1.220123052597046, + "learning_rate": 3.4292825409305586e-06, + "loss": 0.641, + "step": 4790 + }, + { + "epoch": 0.61, + "grad_norm": 1.1228139400482178, + "learning_rate": 3.4273126953111226e-06, + "loss": 0.7457, + "step": 4791 + }, + { + "epoch": 0.61, + "grad_norm": 1.302809715270996, + "learning_rate": 3.425343120555107e-06, + "loss": 0.6077, + "step": 4792 + }, + { + "epoch": 0.61, + "grad_norm": 1.169256567955017, + "learning_rate": 3.4233738170017338e-06, + "loss": 0.6243, + "step": 4793 + }, + { + "epoch": 0.61, + "grad_norm": 1.4685825109481812, + "learning_rate": 3.4214047849901743e-06, + "loss": 0.566, + "step": 4794 + }, + { + "epoch": 0.61, + "grad_norm": 1.632744550704956, + "learning_rate": 3.4194360248595547e-06, + "loss": 0.5817, + "step": 4795 + }, + { + "epoch": 0.61, + "grad_norm": 1.2418441772460938, + "learning_rate": 3.417467536948954e-06, + "loss": 0.5698, + "step": 4796 + }, + { + "epoch": 0.61, + "grad_norm": 1.720788598060608, + "learning_rate": 3.415499321597403e-06, + "loss": 0.6484, + "step": 4797 + }, + { + "epoch": 0.61, + "grad_norm": 1.3634127378463745, + "learning_rate": 3.4135313791438885e-06, + "loss": 0.7072, + "step": 4798 + }, + { + "epoch": 0.61, + "grad_norm": 1.0684276819229126, + "learning_rate": 3.411563709927347e-06, + "loss": 0.6948, + "step": 4799 + }, + { + "epoch": 0.61, + "grad_norm": 1.3760558366775513, + "learning_rate": 3.409596314286669e-06, + "loss": 0.5563, + "step": 4800 + }, + { + "epoch": 0.62, + "grad_norm": 1.910668969154358, + "learning_rate": 3.4076291925607017e-06, + "loss": 0.692, + "step": 4801 + }, + { + "epoch": 0.62, + "grad_norm": 1.1838176250457764, + "learning_rate": 3.4056623450882388e-06, + "loss": 0.6427, + "step": 4802 + }, + { + "epoch": 0.62, + "grad_norm": 1.3909626007080078, + "learning_rate": 3.403695772208032e-06, + "loss": 0.5647, + "step": 4803 + }, + { + "epoch": 0.62, + "grad_norm": 1.0790553092956543, + "learning_rate": 3.4017294742587812e-06, + "loss": 0.6149, + "step": 4804 + }, + { + "epoch": 0.62, + "grad_norm": 1.1320269107818604, + "learning_rate": 3.399763451579144e-06, + "loss": 0.7715, + "step": 4805 + }, + { + "epoch": 0.62, + "grad_norm": 1.1932052373886108, + "learning_rate": 3.3977977045077247e-06, + "loss": 0.6292, + "step": 4806 + }, + { + "epoch": 0.62, + "grad_norm": 1.2008845806121826, + "learning_rate": 3.3958322333830864e-06, + "loss": 0.6292, + "step": 4807 + }, + { + "epoch": 0.62, + "grad_norm": 1.3319114446640015, + "learning_rate": 3.393867038543738e-06, + "loss": 0.5772, + "step": 4808 + }, + { + "epoch": 0.62, + "grad_norm": 1.5973466634750366, + "learning_rate": 3.3919021203281475e-06, + "loss": 0.6426, + "step": 4809 + }, + { + "epoch": 0.62, + "grad_norm": 1.542841911315918, + "learning_rate": 3.389937479074731e-06, + "loss": 0.5787, + "step": 4810 + }, + { + "epoch": 0.62, + "grad_norm": 1.9632922410964966, + "learning_rate": 3.3879731151218575e-06, + "loss": 0.5883, + "step": 4811 + }, + { + "epoch": 0.62, + "grad_norm": 1.3410135507583618, + "learning_rate": 3.3860090288078496e-06, + "loss": 0.6418, + "step": 4812 + }, + { + "epoch": 0.62, + "grad_norm": 1.061713695526123, + "learning_rate": 3.3840452204709806e-06, + "loss": 0.6125, + "step": 4813 + }, + { + "epoch": 0.62, + "grad_norm": 1.0856437683105469, + "learning_rate": 3.382081690449477e-06, + "loss": 0.5871, + "step": 4814 + }, + { + "epoch": 0.62, + "grad_norm": 1.18121337890625, + "learning_rate": 3.3801184390815173e-06, + "loss": 0.5499, + "step": 4815 + }, + { + "epoch": 0.62, + "grad_norm": 1.4576302766799927, + "learning_rate": 3.37815546670523e-06, + "loss": 0.5686, + "step": 4816 + }, + { + "epoch": 0.62, + "grad_norm": 0.9863116145133972, + "learning_rate": 3.3761927736586976e-06, + "loss": 0.58, + "step": 4817 + }, + { + "epoch": 0.62, + "grad_norm": 1.1071988344192505, + "learning_rate": 3.3742303602799565e-06, + "loss": 0.6325, + "step": 4818 + }, + { + "epoch": 0.62, + "grad_norm": 1.518639087677002, + "learning_rate": 3.3722682269069906e-06, + "loss": 0.5877, + "step": 4819 + }, + { + "epoch": 0.62, + "grad_norm": 1.3329269886016846, + "learning_rate": 3.370306373877738e-06, + "loss": 0.5854, + "step": 4820 + }, + { + "epoch": 0.62, + "grad_norm": 1.4260133504867554, + "learning_rate": 3.368344801530087e-06, + "loss": 0.6094, + "step": 4821 + }, + { + "epoch": 0.62, + "grad_norm": 1.498528242111206, + "learning_rate": 3.3663835102018803e-06, + "loss": 0.5885, + "step": 4822 + }, + { + "epoch": 0.62, + "grad_norm": 1.4872510433197021, + "learning_rate": 3.364422500230908e-06, + "loss": 0.581, + "step": 4823 + }, + { + "epoch": 0.62, + "grad_norm": 1.251771092414856, + "learning_rate": 3.3624617719549178e-06, + "loss": 0.6102, + "step": 4824 + }, + { + "epoch": 0.62, + "grad_norm": 1.566983938217163, + "learning_rate": 3.3605013257116016e-06, + "loss": 0.6619, + "step": 4825 + }, + { + "epoch": 0.62, + "grad_norm": 1.4458528757095337, + "learning_rate": 3.3585411618386086e-06, + "loss": 0.5325, + "step": 4826 + }, + { + "epoch": 0.62, + "grad_norm": 1.2375273704528809, + "learning_rate": 3.356581280673536e-06, + "loss": 0.5656, + "step": 4827 + }, + { + "epoch": 0.62, + "grad_norm": 1.2488701343536377, + "learning_rate": 3.3546216825539347e-06, + "loss": 0.5597, + "step": 4828 + }, + { + "epoch": 0.62, + "grad_norm": 1.3934226036071777, + "learning_rate": 3.3526623678173043e-06, + "loss": 0.5244, + "step": 4829 + }, + { + "epoch": 0.62, + "grad_norm": 1.497615098953247, + "learning_rate": 3.350703336801099e-06, + "loss": 0.5815, + "step": 4830 + }, + { + "epoch": 0.62, + "grad_norm": 1.1210520267486572, + "learning_rate": 3.3487445898427195e-06, + "loss": 0.5517, + "step": 4831 + }, + { + "epoch": 0.62, + "grad_norm": 1.2002229690551758, + "learning_rate": 3.346786127279522e-06, + "loss": 0.6144, + "step": 4832 + }, + { + "epoch": 0.62, + "grad_norm": 1.2714262008666992, + "learning_rate": 3.34482794944881e-06, + "loss": 0.5459, + "step": 4833 + }, + { + "epoch": 0.62, + "grad_norm": 1.2544283866882324, + "learning_rate": 3.3428700566878407e-06, + "loss": 0.5844, + "step": 4834 + }, + { + "epoch": 0.62, + "grad_norm": 1.3227342367172241, + "learning_rate": 3.340912449333824e-06, + "loss": 0.5983, + "step": 4835 + }, + { + "epoch": 0.62, + "grad_norm": 1.1427699327468872, + "learning_rate": 3.3389551277239143e-06, + "loss": 0.5442, + "step": 4836 + }, + { + "epoch": 0.62, + "grad_norm": 1.4273881912231445, + "learning_rate": 3.3369980921952227e-06, + "loss": 0.5977, + "step": 4837 + }, + { + "epoch": 0.62, + "grad_norm": 1.286092758178711, + "learning_rate": 3.335041343084807e-06, + "loss": 0.7324, + "step": 4838 + }, + { + "epoch": 0.62, + "grad_norm": 1.303217887878418, + "learning_rate": 3.3330848807296796e-06, + "loss": 0.6638, + "step": 4839 + }, + { + "epoch": 0.62, + "grad_norm": 1.4378916025161743, + "learning_rate": 3.331128705466799e-06, + "loss": 0.63, + "step": 4840 + }, + { + "epoch": 0.62, + "grad_norm": 1.227399468421936, + "learning_rate": 3.3291728176330786e-06, + "loss": 0.625, + "step": 4841 + }, + { + "epoch": 0.62, + "grad_norm": 1.3684594631195068, + "learning_rate": 3.327217217565379e-06, + "loss": 0.6153, + "step": 4842 + }, + { + "epoch": 0.62, + "grad_norm": 1.5865708589553833, + "learning_rate": 3.325261905600514e-06, + "loss": 0.6505, + "step": 4843 + }, + { + "epoch": 0.62, + "grad_norm": 1.1837784051895142, + "learning_rate": 3.3233068820752447e-06, + "loss": 0.5805, + "step": 4844 + }, + { + "epoch": 0.62, + "grad_norm": 1.0995562076568604, + "learning_rate": 3.321352147326285e-06, + "loss": 0.7422, + "step": 4845 + }, + { + "epoch": 0.62, + "grad_norm": 1.242751121520996, + "learning_rate": 3.3193977016902988e-06, + "loss": 0.6068, + "step": 4846 + }, + { + "epoch": 0.62, + "grad_norm": 1.5467709302902222, + "learning_rate": 3.317443545503898e-06, + "loss": 0.5951, + "step": 4847 + }, + { + "epoch": 0.62, + "grad_norm": 1.178579568862915, + "learning_rate": 3.315489679103648e-06, + "loss": 0.515, + "step": 4848 + }, + { + "epoch": 0.62, + "grad_norm": 1.3723570108413696, + "learning_rate": 3.3135361028260604e-06, + "loss": 0.5833, + "step": 4849 + }, + { + "epoch": 0.62, + "grad_norm": 2.414771795272827, + "learning_rate": 3.3115828170076026e-06, + "loss": 0.6358, + "step": 4850 + }, + { + "epoch": 0.62, + "grad_norm": 1.168471336364746, + "learning_rate": 3.3096298219846835e-06, + "loss": 0.5542, + "step": 4851 + }, + { + "epoch": 0.62, + "grad_norm": 1.5970467329025269, + "learning_rate": 3.3076771180936707e-06, + "loss": 0.7044, + "step": 4852 + }, + { + "epoch": 0.62, + "grad_norm": 1.1945412158966064, + "learning_rate": 3.305724705670877e-06, + "loss": 0.5253, + "step": 4853 + }, + { + "epoch": 0.62, + "grad_norm": 1.2077255249023438, + "learning_rate": 3.3037725850525648e-06, + "loss": 0.6126, + "step": 4854 + }, + { + "epoch": 0.62, + "grad_norm": 1.2469735145568848, + "learning_rate": 3.3018207565749484e-06, + "loss": 0.5933, + "step": 4855 + }, + { + "epoch": 0.62, + "grad_norm": 1.3545241355895996, + "learning_rate": 3.2998692205741893e-06, + "loss": 0.5633, + "step": 4856 + }, + { + "epoch": 0.62, + "grad_norm": 1.2162749767303467, + "learning_rate": 3.2979179773864013e-06, + "loss": 0.5882, + "step": 4857 + }, + { + "epoch": 0.62, + "grad_norm": 1.2614243030548096, + "learning_rate": 3.295967027347645e-06, + "loss": 0.5937, + "step": 4858 + }, + { + "epoch": 0.62, + "grad_norm": 1.402371883392334, + "learning_rate": 3.2940163707939333e-06, + "loss": 0.5585, + "step": 4859 + }, + { + "epoch": 0.62, + "grad_norm": 1.107991099357605, + "learning_rate": 3.2920660080612245e-06, + "loss": 0.6597, + "step": 4860 + }, + { + "epoch": 0.62, + "grad_norm": 1.2759180068969727, + "learning_rate": 3.2901159394854324e-06, + "loss": 0.5066, + "step": 4861 + }, + { + "epoch": 0.62, + "grad_norm": 1.3145313262939453, + "learning_rate": 3.2881661654024144e-06, + "loss": 0.5645, + "step": 4862 + }, + { + "epoch": 0.62, + "grad_norm": 1.1362555027008057, + "learning_rate": 3.2862166861479806e-06, + "loss": 0.5143, + "step": 4863 + }, + { + "epoch": 0.62, + "grad_norm": 1.101493000984192, + "learning_rate": 3.284267502057888e-06, + "loss": 0.6588, + "step": 4864 + }, + { + "epoch": 0.62, + "grad_norm": 1.3010179996490479, + "learning_rate": 3.2823186134678455e-06, + "loss": 0.5626, + "step": 4865 + }, + { + "epoch": 0.62, + "grad_norm": 1.1528490781784058, + "learning_rate": 3.280370020713507e-06, + "loss": 0.6245, + "step": 4866 + }, + { + "epoch": 0.62, + "grad_norm": 1.2665293216705322, + "learning_rate": 3.2784217241304815e-06, + "loss": 0.5826, + "step": 4867 + }, + { + "epoch": 0.62, + "grad_norm": 1.4382033348083496, + "learning_rate": 3.2764737240543192e-06, + "loss": 0.565, + "step": 4868 + }, + { + "epoch": 0.62, + "grad_norm": 1.322492003440857, + "learning_rate": 3.2745260208205273e-06, + "loss": 0.5892, + "step": 4869 + }, + { + "epoch": 0.62, + "grad_norm": 1.3495713472366333, + "learning_rate": 3.2725786147645577e-06, + "loss": 0.6144, + "step": 4870 + }, + { + "epoch": 0.62, + "grad_norm": 1.2086135149002075, + "learning_rate": 3.2706315062218085e-06, + "loss": 0.5433, + "step": 4871 + }, + { + "epoch": 0.62, + "grad_norm": 1.6504979133605957, + "learning_rate": 3.268684695527634e-06, + "loss": 0.5627, + "step": 4872 + }, + { + "epoch": 0.62, + "grad_norm": 1.2472084760665894, + "learning_rate": 3.2667381830173287e-06, + "loss": 0.5854, + "step": 4873 + }, + { + "epoch": 0.62, + "grad_norm": 1.8455642461776733, + "learning_rate": 3.2647919690261433e-06, + "loss": 0.5472, + "step": 4874 + }, + { + "epoch": 0.62, + "grad_norm": 1.2269060611724854, + "learning_rate": 3.26284605388927e-06, + "loss": 0.5151, + "step": 4875 + }, + { + "epoch": 0.62, + "grad_norm": 0.9887154698371887, + "learning_rate": 3.2609004379418564e-06, + "loss": 0.6177, + "step": 4876 + }, + { + "epoch": 0.62, + "grad_norm": 1.5012866258621216, + "learning_rate": 3.2589551215189925e-06, + "loss": 0.694, + "step": 4877 + }, + { + "epoch": 0.62, + "grad_norm": 1.2509353160858154, + "learning_rate": 3.257010104955722e-06, + "loss": 0.6014, + "step": 4878 + }, + { + "epoch": 0.63, + "grad_norm": 1.3519951105117798, + "learning_rate": 3.255065388587032e-06, + "loss": 0.6751, + "step": 4879 + }, + { + "epoch": 0.63, + "grad_norm": 1.2042193412780762, + "learning_rate": 3.253120972747863e-06, + "loss": 0.5931, + "step": 4880 + }, + { + "epoch": 0.63, + "grad_norm": 1.2597451210021973, + "learning_rate": 3.251176857773099e-06, + "loss": 0.6325, + "step": 4881 + }, + { + "epoch": 0.63, + "grad_norm": 1.296736478805542, + "learning_rate": 3.249233043997576e-06, + "loss": 0.6089, + "step": 4882 + }, + { + "epoch": 0.63, + "grad_norm": 1.2353603839874268, + "learning_rate": 3.2472895317560744e-06, + "loss": 0.6336, + "step": 4883 + }, + { + "epoch": 0.63, + "grad_norm": 1.2520846128463745, + "learning_rate": 3.2453463213833267e-06, + "loss": 0.5977, + "step": 4884 + }, + { + "epoch": 0.63, + "grad_norm": 1.338379979133606, + "learning_rate": 3.2434034132140085e-06, + "loss": 0.6035, + "step": 4885 + }, + { + "epoch": 0.63, + "grad_norm": 1.2667759656906128, + "learning_rate": 3.241460807582749e-06, + "loss": 0.7181, + "step": 4886 + }, + { + "epoch": 0.63, + "grad_norm": 1.2850943803787231, + "learning_rate": 3.2395185048241235e-06, + "loss": 0.4562, + "step": 4887 + }, + { + "epoch": 0.63, + "grad_norm": 1.7538964748382568, + "learning_rate": 3.2375765052726505e-06, + "loss": 0.5707, + "step": 4888 + }, + { + "epoch": 0.63, + "grad_norm": 1.2799808979034424, + "learning_rate": 3.2356348092628038e-06, + "loss": 0.692, + "step": 4889 + }, + { + "epoch": 0.63, + "grad_norm": 1.681443452835083, + "learning_rate": 3.2336934171289974e-06, + "loss": 0.5608, + "step": 4890 + }, + { + "epoch": 0.63, + "grad_norm": 1.5989010334014893, + "learning_rate": 3.2317523292055998e-06, + "loss": 0.5823, + "step": 4891 + }, + { + "epoch": 0.63, + "grad_norm": 1.2030177116394043, + "learning_rate": 3.2298115458269212e-06, + "loss": 0.6447, + "step": 4892 + }, + { + "epoch": 0.63, + "grad_norm": 1.3958220481872559, + "learning_rate": 3.227871067327225e-06, + "loss": 0.6051, + "step": 4893 + }, + { + "epoch": 0.63, + "grad_norm": 1.4750139713287354, + "learning_rate": 3.225930894040717e-06, + "loss": 0.6222, + "step": 4894 + }, + { + "epoch": 0.63, + "grad_norm": 1.5289214849472046, + "learning_rate": 3.2239910263015524e-06, + "loss": 0.626, + "step": 4895 + }, + { + "epoch": 0.63, + "grad_norm": 1.1837035417556763, + "learning_rate": 3.222051464443836e-06, + "loss": 0.5143, + "step": 4896 + }, + { + "epoch": 0.63, + "grad_norm": 1.1531528234481812, + "learning_rate": 3.220112208801615e-06, + "loss": 0.6023, + "step": 4897 + }, + { + "epoch": 0.63, + "grad_norm": 1.0396391153335571, + "learning_rate": 3.21817325970889e-06, + "loss": 0.5225, + "step": 4898 + }, + { + "epoch": 0.63, + "grad_norm": 1.2080007791519165, + "learning_rate": 3.216234617499603e-06, + "loss": 0.4965, + "step": 4899 + }, + { + "epoch": 0.63, + "grad_norm": 1.1739264726638794, + "learning_rate": 3.2142962825076477e-06, + "loss": 0.5772, + "step": 4900 + }, + { + "epoch": 0.63, + "grad_norm": 1.0638878345489502, + "learning_rate": 3.2123582550668608e-06, + "loss": 0.6668, + "step": 4901 + }, + { + "epoch": 0.63, + "grad_norm": 1.3319416046142578, + "learning_rate": 3.210420535511031e-06, + "loss": 0.5559, + "step": 4902 + }, + { + "epoch": 0.63, + "grad_norm": 1.216744065284729, + "learning_rate": 3.2084831241738866e-06, + "loss": 0.4514, + "step": 4903 + }, + { + "epoch": 0.63, + "grad_norm": 1.1501775979995728, + "learning_rate": 3.206546021389111e-06, + "loss": 0.5571, + "step": 4904 + }, + { + "epoch": 0.63, + "grad_norm": 1.2810231447219849, + "learning_rate": 3.2046092274903316e-06, + "loss": 0.7026, + "step": 4905 + }, + { + "epoch": 0.63, + "grad_norm": 1.1379876136779785, + "learning_rate": 3.2026727428111186e-06, + "loss": 0.5862, + "step": 4906 + }, + { + "epoch": 0.63, + "grad_norm": 1.143991231918335, + "learning_rate": 3.200736567684995e-06, + "loss": 0.5613, + "step": 4907 + }, + { + "epoch": 0.63, + "grad_norm": 1.5698902606964111, + "learning_rate": 3.198800702445425e-06, + "loss": 0.5612, + "step": 4908 + }, + { + "epoch": 0.63, + "grad_norm": 1.3888661861419678, + "learning_rate": 3.196865147425824e-06, + "loss": 0.6665, + "step": 4909 + }, + { + "epoch": 0.63, + "grad_norm": 1.4403868913650513, + "learning_rate": 3.19492990295955e-06, + "loss": 0.5879, + "step": 4910 + }, + { + "epoch": 0.63, + "grad_norm": 1.532800316810608, + "learning_rate": 3.1929949693799134e-06, + "loss": 0.7634, + "step": 4911 + }, + { + "epoch": 0.63, + "grad_norm": 1.1193608045578003, + "learning_rate": 3.1910603470201616e-06, + "loss": 0.6415, + "step": 4912 + }, + { + "epoch": 0.63, + "grad_norm": 1.3783601522445679, + "learning_rate": 3.189126036213499e-06, + "loss": 0.6293, + "step": 4913 + }, + { + "epoch": 0.63, + "grad_norm": 1.1927382946014404, + "learning_rate": 3.1871920372930687e-06, + "loss": 0.5678, + "step": 4914 + }, + { + "epoch": 0.63, + "grad_norm": 1.2124135494232178, + "learning_rate": 3.185258350591963e-06, + "loss": 0.6184, + "step": 4915 + }, + { + "epoch": 0.63, + "grad_norm": 1.0941423177719116, + "learning_rate": 3.1833249764432206e-06, + "loss": 0.5018, + "step": 4916 + }, + { + "epoch": 0.63, + "grad_norm": 1.1445602178573608, + "learning_rate": 3.1813919151798265e-06, + "loss": 0.654, + "step": 4917 + }, + { + "epoch": 0.63, + "grad_norm": 1.6601332426071167, + "learning_rate": 3.1794591671347087e-06, + "loss": 0.5667, + "step": 4918 + }, + { + "epoch": 0.63, + "grad_norm": 1.3656716346740723, + "learning_rate": 3.177526732640747e-06, + "loss": 0.5714, + "step": 4919 + }, + { + "epoch": 0.63, + "grad_norm": 1.346827507019043, + "learning_rate": 3.1755946120307605e-06, + "loss": 0.583, + "step": 4920 + }, + { + "epoch": 0.63, + "grad_norm": 1.2119982242584229, + "learning_rate": 3.173662805637521e-06, + "loss": 0.5986, + "step": 4921 + }, + { + "epoch": 0.63, + "grad_norm": 1.3897769451141357, + "learning_rate": 3.1717313137937415e-06, + "loss": 0.5864, + "step": 4922 + }, + { + "epoch": 0.63, + "grad_norm": 1.1082748174667358, + "learning_rate": 3.1698001368320817e-06, + "loss": 0.5421, + "step": 4923 + }, + { + "epoch": 0.63, + "grad_norm": 1.4113824367523193, + "learning_rate": 3.16786927508515e-06, + "loss": 0.588, + "step": 4924 + }, + { + "epoch": 0.63, + "grad_norm": 1.2338011264801025, + "learning_rate": 3.1659387288854937e-06, + "loss": 0.5927, + "step": 4925 + }, + { + "epoch": 0.63, + "grad_norm": 1.165490746498108, + "learning_rate": 3.164008498565615e-06, + "loss": 0.5956, + "step": 4926 + }, + { + "epoch": 0.63, + "grad_norm": 1.206478476524353, + "learning_rate": 3.1620785844579526e-06, + "loss": 0.589, + "step": 4927 + }, + { + "epoch": 0.63, + "grad_norm": 1.202146291732788, + "learning_rate": 3.160148986894899e-06, + "loss": 0.6487, + "step": 4928 + }, + { + "epoch": 0.63, + "grad_norm": 1.1670382022857666, + "learning_rate": 3.1582197062087837e-06, + "loss": 0.495, + "step": 4929 + }, + { + "epoch": 0.63, + "grad_norm": 1.248085618019104, + "learning_rate": 3.15629074273189e-06, + "loss": 0.6223, + "step": 4930 + }, + { + "epoch": 0.63, + "grad_norm": 1.2666771411895752, + "learning_rate": 3.15436209679644e-06, + "loss": 0.6845, + "step": 4931 + }, + { + "epoch": 0.63, + "grad_norm": 1.1898845434188843, + "learning_rate": 3.1524337687346065e-06, + "loss": 0.5541, + "step": 4932 + }, + { + "epoch": 0.63, + "grad_norm": 1.34114670753479, + "learning_rate": 3.150505758878501e-06, + "loss": 0.5768, + "step": 4933 + }, + { + "epoch": 0.63, + "grad_norm": 1.2350118160247803, + "learning_rate": 3.1485780675601878e-06, + "loss": 0.5804, + "step": 4934 + }, + { + "epoch": 0.63, + "grad_norm": 1.3063546419143677, + "learning_rate": 3.1466506951116697e-06, + "loss": 0.6739, + "step": 4935 + }, + { + "epoch": 0.63, + "grad_norm": 1.5891927480697632, + "learning_rate": 3.1447236418648997e-06, + "loss": 0.5966, + "step": 4936 + }, + { + "epoch": 0.63, + "grad_norm": 1.3554737567901611, + "learning_rate": 3.1427969081517705e-06, + "loss": 0.5389, + "step": 4937 + }, + { + "epoch": 0.63, + "grad_norm": 1.4271050691604614, + "learning_rate": 3.1408704943041257e-06, + "loss": 0.6583, + "step": 4938 + }, + { + "epoch": 0.63, + "grad_norm": 1.2718795537948608, + "learning_rate": 3.1389444006537517e-06, + "loss": 0.5947, + "step": 4939 + }, + { + "epoch": 0.63, + "grad_norm": 1.1423945426940918, + "learning_rate": 3.1370186275323756e-06, + "loss": 0.5624, + "step": 4940 + }, + { + "epoch": 0.63, + "grad_norm": 1.107884168624878, + "learning_rate": 3.135093175271676e-06, + "loss": 0.6333, + "step": 4941 + }, + { + "epoch": 0.63, + "grad_norm": 1.4218220710754395, + "learning_rate": 3.1331680442032697e-06, + "loss": 0.6028, + "step": 4942 + }, + { + "epoch": 0.63, + "grad_norm": 1.4798823595046997, + "learning_rate": 3.131243234658724e-06, + "loss": 0.5927, + "step": 4943 + }, + { + "epoch": 0.63, + "grad_norm": 1.1230974197387695, + "learning_rate": 3.1293187469695472e-06, + "loss": 0.5974, + "step": 4944 + }, + { + "epoch": 0.63, + "grad_norm": 1.2540388107299805, + "learning_rate": 3.127394581467193e-06, + "loss": 0.5366, + "step": 4945 + }, + { + "epoch": 0.63, + "grad_norm": 1.489526629447937, + "learning_rate": 3.1254707384830607e-06, + "loss": 0.6562, + "step": 4946 + }, + { + "epoch": 0.63, + "grad_norm": 1.336727261543274, + "learning_rate": 3.123547218348491e-06, + "loss": 0.6517, + "step": 4947 + }, + { + "epoch": 0.63, + "grad_norm": 1.2091395854949951, + "learning_rate": 3.121624021394774e-06, + "loss": 0.6301, + "step": 4948 + }, + { + "epoch": 0.63, + "grad_norm": 1.2998344898223877, + "learning_rate": 3.1197011479531386e-06, + "loss": 0.7585, + "step": 4949 + }, + { + "epoch": 0.63, + "grad_norm": 1.2029203176498413, + "learning_rate": 3.1177785983547633e-06, + "loss": 0.5922, + "step": 4950 + }, + { + "epoch": 0.63, + "grad_norm": 1.2441179752349854, + "learning_rate": 3.1158563729307658e-06, + "loss": 0.6311, + "step": 4951 + }, + { + "epoch": 0.63, + "grad_norm": 1.2800577878952026, + "learning_rate": 3.113934472012212e-06, + "loss": 0.5742, + "step": 4952 + }, + { + "epoch": 0.63, + "grad_norm": 1.2300503253936768, + "learning_rate": 3.112012895930109e-06, + "loss": 0.5904, + "step": 4953 + }, + { + "epoch": 0.63, + "grad_norm": 1.1221928596496582, + "learning_rate": 3.110091645015409e-06, + "loss": 0.5441, + "step": 4954 + }, + { + "epoch": 0.63, + "grad_norm": 1.1386147737503052, + "learning_rate": 3.1081707195990115e-06, + "loss": 0.6479, + "step": 4955 + }, + { + "epoch": 0.63, + "grad_norm": 0.9600458145141602, + "learning_rate": 3.1062501200117536e-06, + "loss": 0.4795, + "step": 4956 + }, + { + "epoch": 0.64, + "grad_norm": 1.1221802234649658, + "learning_rate": 3.1043298465844207e-06, + "loss": 0.5593, + "step": 4957 + }, + { + "epoch": 0.64, + "grad_norm": 1.3925119638442993, + "learning_rate": 3.1024098996477407e-06, + "loss": 0.5899, + "step": 4958 + }, + { + "epoch": 0.64, + "grad_norm": 1.0606712102890015, + "learning_rate": 3.1004902795323867e-06, + "loss": 0.4922, + "step": 4959 + }, + { + "epoch": 0.64, + "grad_norm": 1.3476324081420898, + "learning_rate": 3.098570986568972e-06, + "loss": 0.6115, + "step": 4960 + }, + { + "epoch": 0.64, + "grad_norm": 1.3782424926757812, + "learning_rate": 3.096652021088057e-06, + "loss": 0.6623, + "step": 4961 + }, + { + "epoch": 0.64, + "grad_norm": 1.2778935432434082, + "learning_rate": 3.0947333834201443e-06, + "loss": 0.6427, + "step": 4962 + }, + { + "epoch": 0.64, + "grad_norm": 3.0849034786224365, + "learning_rate": 3.092815073895681e-06, + "loss": 0.573, + "step": 4963 + }, + { + "epoch": 0.64, + "grad_norm": 1.0979350805282593, + "learning_rate": 3.0908970928450555e-06, + "loss": 0.5269, + "step": 4964 + }, + { + "epoch": 0.64, + "grad_norm": 1.2356542348861694, + "learning_rate": 3.0889794405986024e-06, + "loss": 0.5587, + "step": 4965 + }, + { + "epoch": 0.64, + "grad_norm": 1.2701390981674194, + "learning_rate": 3.087062117486597e-06, + "loss": 0.6439, + "step": 4966 + }, + { + "epoch": 0.64, + "grad_norm": 1.3336454629898071, + "learning_rate": 3.0851451238392604e-06, + "loss": 0.5936, + "step": 4967 + }, + { + "epoch": 0.64, + "grad_norm": 1.7004276514053345, + "learning_rate": 3.0832284599867544e-06, + "loss": 0.5704, + "step": 4968 + }, + { + "epoch": 0.64, + "grad_norm": 1.0348007678985596, + "learning_rate": 3.0813121262591885e-06, + "loss": 0.5852, + "step": 4969 + }, + { + "epoch": 0.64, + "grad_norm": 1.3528045415878296, + "learning_rate": 3.0793961229866077e-06, + "loss": 0.6092, + "step": 4970 + }, + { + "epoch": 0.64, + "grad_norm": 1.1565823554992676, + "learning_rate": 3.0774804504990064e-06, + "loss": 0.6212, + "step": 4971 + }, + { + "epoch": 0.64, + "grad_norm": 1.185302734375, + "learning_rate": 3.0755651091263233e-06, + "loss": 0.5614, + "step": 4972 + }, + { + "epoch": 0.64, + "grad_norm": 1.3281807899475098, + "learning_rate": 3.073650099198433e-06, + "loss": 0.6672, + "step": 4973 + }, + { + "epoch": 0.64, + "grad_norm": 1.251400351524353, + "learning_rate": 3.07173542104516e-06, + "loss": 0.7054, + "step": 4974 + }, + { + "epoch": 0.64, + "grad_norm": 0.98326575756073, + "learning_rate": 3.069821074996266e-06, + "loss": 0.6104, + "step": 4975 + }, + { + "epoch": 0.64, + "grad_norm": 1.5410773754119873, + "learning_rate": 3.067907061381461e-06, + "loss": 0.6519, + "step": 4976 + }, + { + "epoch": 0.64, + "grad_norm": 1.4055252075195312, + "learning_rate": 3.0659933805303914e-06, + "loss": 0.607, + "step": 4977 + }, + { + "epoch": 0.64, + "grad_norm": 1.695743203163147, + "learning_rate": 3.0640800327726537e-06, + "loss": 0.6483, + "step": 4978 + }, + { + "epoch": 0.64, + "grad_norm": 1.2075164318084717, + "learning_rate": 3.06216701843778e-06, + "loss": 0.5343, + "step": 4979 + }, + { + "epoch": 0.64, + "grad_norm": 1.335493564605713, + "learning_rate": 3.060254337855251e-06, + "loss": 0.6638, + "step": 4980 + }, + { + "epoch": 0.64, + "grad_norm": 1.1143360137939453, + "learning_rate": 3.0583419913544833e-06, + "loss": 0.5355, + "step": 4981 + }, + { + "epoch": 0.64, + "grad_norm": 1.7671163082122803, + "learning_rate": 3.056429979264844e-06, + "loss": 0.5906, + "step": 4982 + }, + { + "epoch": 0.64, + "grad_norm": 1.333821415901184, + "learning_rate": 3.0545183019156345e-06, + "loss": 0.6186, + "step": 4983 + }, + { + "epoch": 0.64, + "grad_norm": 1.1221269369125366, + "learning_rate": 3.052606959636106e-06, + "loss": 0.634, + "step": 4984 + }, + { + "epoch": 0.64, + "grad_norm": 1.378235936164856, + "learning_rate": 3.0506959527554445e-06, + "loss": 0.5971, + "step": 4985 + }, + { + "epoch": 0.64, + "grad_norm": 1.267785668373108, + "learning_rate": 3.0487852816027853e-06, + "loss": 0.5958, + "step": 4986 + }, + { + "epoch": 0.64, + "grad_norm": 1.2087866067886353, + "learning_rate": 3.046874946507201e-06, + "loss": 0.6207, + "step": 4987 + }, + { + "epoch": 0.64, + "grad_norm": 1.871625542640686, + "learning_rate": 3.0449649477977073e-06, + "loss": 0.5814, + "step": 4988 + }, + { + "epoch": 0.64, + "grad_norm": 1.2489285469055176, + "learning_rate": 3.0430552858032647e-06, + "loss": 0.6081, + "step": 4989 + }, + { + "epoch": 0.64, + "grad_norm": 1.1966968774795532, + "learning_rate": 3.0411459608527727e-06, + "loss": 0.5906, + "step": 4990 + }, + { + "epoch": 0.64, + "grad_norm": 1.507742166519165, + "learning_rate": 3.039236973275075e-06, + "loss": 0.628, + "step": 4991 + }, + { + "epoch": 0.64, + "grad_norm": 1.4480714797973633, + "learning_rate": 3.037328323398953e-06, + "loss": 0.5951, + "step": 4992 + }, + { + "epoch": 0.64, + "grad_norm": 1.2698642015457153, + "learning_rate": 3.035420011553136e-06, + "loss": 0.5878, + "step": 4993 + }, + { + "epoch": 0.64, + "grad_norm": 1.0168004035949707, + "learning_rate": 3.0335120380662897e-06, + "loss": 0.6092, + "step": 4994 + }, + { + "epoch": 0.64, + "grad_norm": 1.1975743770599365, + "learning_rate": 3.0316044032670245e-06, + "loss": 0.607, + "step": 4995 + }, + { + "epoch": 0.64, + "grad_norm": 1.311187982559204, + "learning_rate": 3.0296971074838923e-06, + "loss": 0.5956, + "step": 4996 + }, + { + "epoch": 0.64, + "grad_norm": 1.199485182762146, + "learning_rate": 3.027790151045384e-06, + "loss": 0.7287, + "step": 4997 + }, + { + "epoch": 0.64, + "grad_norm": 1.2564228773117065, + "learning_rate": 3.0258835342799362e-06, + "loss": 0.6196, + "step": 4998 + }, + { + "epoch": 0.64, + "grad_norm": 1.117231011390686, + "learning_rate": 3.023977257515924e-06, + "loss": 0.5185, + "step": 4999 + }, + { + "epoch": 0.64, + "grad_norm": 1.2204550504684448, + "learning_rate": 3.022071321081666e-06, + "loss": 0.5253, + "step": 5000 + }, + { + "epoch": 0.64, + "grad_norm": 1.2541391849517822, + "learning_rate": 3.020165725305419e-06, + "loss": 0.6425, + "step": 5001 + }, + { + "epoch": 0.64, + "grad_norm": 1.467839241027832, + "learning_rate": 3.018260470515385e-06, + "loss": 0.6367, + "step": 5002 + }, + { + "epoch": 0.64, + "grad_norm": 1.1253759860992432, + "learning_rate": 3.016355557039704e-06, + "loss": 0.5926, + "step": 5003 + }, + { + "epoch": 0.64, + "grad_norm": 1.2686272859573364, + "learning_rate": 3.0144509852064597e-06, + "loss": 0.534, + "step": 5004 + }, + { + "epoch": 0.64, + "grad_norm": 1.152122139930725, + "learning_rate": 3.0125467553436737e-06, + "loss": 0.5331, + "step": 5005 + }, + { + "epoch": 0.64, + "grad_norm": 1.1917579174041748, + "learning_rate": 3.0106428677793133e-06, + "loss": 0.596, + "step": 5006 + }, + { + "epoch": 0.64, + "grad_norm": 1.1018012762069702, + "learning_rate": 3.008739322841285e-06, + "loss": 0.6628, + "step": 5007 + }, + { + "epoch": 0.64, + "grad_norm": 1.526096224784851, + "learning_rate": 3.0068361208574336e-06, + "loss": 0.604, + "step": 5008 + }, + { + "epoch": 0.64, + "grad_norm": 1.2660118341445923, + "learning_rate": 3.0049332621555483e-06, + "loss": 0.6269, + "step": 5009 + }, + { + "epoch": 0.64, + "grad_norm": 1.051699161529541, + "learning_rate": 3.003030747063357e-06, + "loss": 0.5862, + "step": 5010 + }, + { + "epoch": 0.64, + "grad_norm": 1.3319251537322998, + "learning_rate": 3.0011285759085296e-06, + "loss": 0.5735, + "step": 5011 + }, + { + "epoch": 0.64, + "grad_norm": 1.586374282836914, + "learning_rate": 2.9992267490186766e-06, + "loss": 0.6117, + "step": 5012 + }, + { + "epoch": 0.64, + "grad_norm": 1.3304932117462158, + "learning_rate": 2.9973252667213494e-06, + "loss": 0.6032, + "step": 5013 + }, + { + "epoch": 0.64, + "grad_norm": 1.2678347826004028, + "learning_rate": 2.995424129344038e-06, + "loss": 0.4959, + "step": 5014 + }, + { + "epoch": 0.64, + "grad_norm": 1.3165990114212036, + "learning_rate": 2.993523337214177e-06, + "loss": 0.6297, + "step": 5015 + }, + { + "epoch": 0.64, + "grad_norm": 1.1281373500823975, + "learning_rate": 2.9916228906591366e-06, + "loss": 0.5204, + "step": 5016 + }, + { + "epoch": 0.64, + "grad_norm": 1.138863205909729, + "learning_rate": 2.9897227900062327e-06, + "loss": 0.6399, + "step": 5017 + }, + { + "epoch": 0.64, + "grad_norm": 1.1743173599243164, + "learning_rate": 2.9878230355827166e-06, + "loss": 0.5562, + "step": 5018 + }, + { + "epoch": 0.64, + "grad_norm": 1.163933277130127, + "learning_rate": 2.985923627715785e-06, + "loss": 0.614, + "step": 5019 + }, + { + "epoch": 0.64, + "grad_norm": 1.2621997594833374, + "learning_rate": 2.9840245667325697e-06, + "loss": 0.6404, + "step": 5020 + }, + { + "epoch": 0.64, + "grad_norm": 1.3806610107421875, + "learning_rate": 2.982125852960148e-06, + "loss": 0.6053, + "step": 5021 + }, + { + "epoch": 0.64, + "grad_norm": 1.8941912651062012, + "learning_rate": 2.9802274867255306e-06, + "loss": 0.6629, + "step": 5022 + }, + { + "epoch": 0.64, + "grad_norm": 1.2895264625549316, + "learning_rate": 2.9783294683556764e-06, + "loss": 0.6593, + "step": 5023 + }, + { + "epoch": 0.64, + "grad_norm": 1.5828664302825928, + "learning_rate": 2.9764317981774804e-06, + "loss": 0.6568, + "step": 5024 + }, + { + "epoch": 0.64, + "grad_norm": 1.4575058221817017, + "learning_rate": 2.9745344765177753e-06, + "loss": 0.6443, + "step": 5025 + }, + { + "epoch": 0.64, + "grad_norm": 1.212241530418396, + "learning_rate": 2.9726375037033396e-06, + "loss": 0.5902, + "step": 5026 + }, + { + "epoch": 0.64, + "grad_norm": 1.538864016532898, + "learning_rate": 2.9707408800608837e-06, + "loss": 0.5993, + "step": 5027 + }, + { + "epoch": 0.64, + "grad_norm": 1.0669389963150024, + "learning_rate": 2.968844605917067e-06, + "loss": 0.5779, + "step": 5028 + }, + { + "epoch": 0.64, + "grad_norm": 1.2678580284118652, + "learning_rate": 2.9669486815984807e-06, + "loss": 0.6404, + "step": 5029 + }, + { + "epoch": 0.64, + "grad_norm": 1.2878882884979248, + "learning_rate": 2.965053107431662e-06, + "loss": 0.5885, + "step": 5030 + }, + { + "epoch": 0.64, + "grad_norm": 1.1472769975662231, + "learning_rate": 2.9631578837430826e-06, + "loss": 0.6865, + "step": 5031 + }, + { + "epoch": 0.64, + "grad_norm": 1.0584124326705933, + "learning_rate": 2.9612630108591576e-06, + "loss": 0.5281, + "step": 5032 + }, + { + "epoch": 0.64, + "grad_norm": 1.6062891483306885, + "learning_rate": 2.9593684891062403e-06, + "loss": 0.5883, + "step": 5033 + }, + { + "epoch": 0.64, + "grad_norm": 1.0802760124206543, + "learning_rate": 2.9574743188106235e-06, + "loss": 0.5788, + "step": 5034 + }, + { + "epoch": 0.65, + "grad_norm": 1.1777443885803223, + "learning_rate": 2.9555805002985394e-06, + "loss": 0.5501, + "step": 5035 + }, + { + "epoch": 0.65, + "grad_norm": 1.3648144006729126, + "learning_rate": 2.9536870338961597e-06, + "loss": 0.6221, + "step": 5036 + }, + { + "epoch": 0.65, + "grad_norm": 1.4176534414291382, + "learning_rate": 2.9517939199295965e-06, + "loss": 0.6233, + "step": 5037 + }, + { + "epoch": 0.65, + "grad_norm": 1.1230452060699463, + "learning_rate": 2.9499011587248972e-06, + "loss": 0.5559, + "step": 5038 + }, + { + "epoch": 0.65, + "grad_norm": 1.0202972888946533, + "learning_rate": 2.948008750608055e-06, + "loss": 0.6254, + "step": 5039 + }, + { + "epoch": 0.65, + "grad_norm": 1.4414222240447998, + "learning_rate": 2.9461166959049943e-06, + "loss": 0.5967, + "step": 5040 + }, + { + "epoch": 0.65, + "grad_norm": 1.1397916078567505, + "learning_rate": 2.9442249949415893e-06, + "loss": 0.4967, + "step": 5041 + }, + { + "epoch": 0.65, + "grad_norm": 1.192447543144226, + "learning_rate": 2.942333648043643e-06, + "loss": 0.5351, + "step": 5042 + }, + { + "epoch": 0.65, + "grad_norm": 1.623815894126892, + "learning_rate": 2.9404426555369012e-06, + "loss": 0.5818, + "step": 5043 + }, + { + "epoch": 0.65, + "grad_norm": 1.3485163450241089, + "learning_rate": 2.9385520177470517e-06, + "loss": 0.5429, + "step": 5044 + }, + { + "epoch": 0.65, + "grad_norm": 1.3247160911560059, + "learning_rate": 2.936661734999715e-06, + "loss": 0.5124, + "step": 5045 + }, + { + "epoch": 0.65, + "grad_norm": 1.489458441734314, + "learning_rate": 2.934771807620457e-06, + "loss": 0.6052, + "step": 5046 + }, + { + "epoch": 0.65, + "grad_norm": 1.4193159341812134, + "learning_rate": 2.932882235934776e-06, + "loss": 0.5938, + "step": 5047 + }, + { + "epoch": 0.65, + "grad_norm": 1.929944396018982, + "learning_rate": 2.9309930202681156e-06, + "loss": 0.5478, + "step": 5048 + }, + { + "epoch": 0.65, + "grad_norm": 1.3525385856628418, + "learning_rate": 2.9291041609458528e-06, + "loss": 0.5828, + "step": 5049 + }, + { + "epoch": 0.65, + "grad_norm": 1.367734670639038, + "learning_rate": 2.9272156582933063e-06, + "loss": 0.5198, + "step": 5050 + }, + { + "epoch": 0.65, + "grad_norm": 1.2847005128860474, + "learning_rate": 2.9253275126357313e-06, + "loss": 0.5772, + "step": 5051 + }, + { + "epoch": 0.65, + "grad_norm": 1.4280390739440918, + "learning_rate": 2.9234397242983232e-06, + "loss": 0.5525, + "step": 5052 + }, + { + "epoch": 0.65, + "grad_norm": 1.293440580368042, + "learning_rate": 2.921552293606214e-06, + "loss": 0.5723, + "step": 5053 + }, + { + "epoch": 0.65, + "grad_norm": 1.1732693910598755, + "learning_rate": 2.919665220884478e-06, + "loss": 0.6308, + "step": 5054 + }, + { + "epoch": 0.65, + "grad_norm": 1.4701364040374756, + "learning_rate": 2.917778506458121e-06, + "loss": 0.5309, + "step": 5055 + }, + { + "epoch": 0.65, + "grad_norm": 1.3998725414276123, + "learning_rate": 2.9158921506520943e-06, + "loss": 0.5675, + "step": 5056 + }, + { + "epoch": 0.65, + "grad_norm": 1.3049033880233765, + "learning_rate": 2.9140061537912835e-06, + "loss": 0.6115, + "step": 5057 + }, + { + "epoch": 0.65, + "grad_norm": 1.081899881362915, + "learning_rate": 2.9121205162005134e-06, + "loss": 0.5292, + "step": 5058 + }, + { + "epoch": 0.65, + "grad_norm": 1.247861623764038, + "learning_rate": 2.9102352382045464e-06, + "loss": 0.5237, + "step": 5059 + }, + { + "epoch": 0.65, + "grad_norm": 1.3318337202072144, + "learning_rate": 2.908350320128086e-06, + "loss": 0.6684, + "step": 5060 + }, + { + "epoch": 0.65, + "grad_norm": 1.313628077507019, + "learning_rate": 2.906465762295766e-06, + "loss": 0.6259, + "step": 5061 + }, + { + "epoch": 0.65, + "grad_norm": 1.4944661855697632, + "learning_rate": 2.904581565032166e-06, + "loss": 0.5594, + "step": 5062 + }, + { + "epoch": 0.65, + "grad_norm": 1.2283939123153687, + "learning_rate": 2.902697728661801e-06, + "loss": 0.5856, + "step": 5063 + }, + { + "epoch": 0.65, + "grad_norm": 1.3918577432632446, + "learning_rate": 2.9008142535091246e-06, + "loss": 0.5818, + "step": 5064 + }, + { + "epoch": 0.65, + "grad_norm": 1.163896918296814, + "learning_rate": 2.898931139898523e-06, + "loss": 0.6553, + "step": 5065 + }, + { + "epoch": 0.65, + "grad_norm": 1.1158233880996704, + "learning_rate": 2.897048388154328e-06, + "loss": 0.5073, + "step": 5066 + }, + { + "epoch": 0.65, + "grad_norm": 1.0618435144424438, + "learning_rate": 2.895165998600803e-06, + "loss": 0.5781, + "step": 5067 + }, + { + "epoch": 0.65, + "grad_norm": 1.5787274837493896, + "learning_rate": 2.893283971562154e-06, + "loss": 0.597, + "step": 5068 + }, + { + "epoch": 0.65, + "grad_norm": 1.4554435014724731, + "learning_rate": 2.891402307362519e-06, + "loss": 0.5957, + "step": 5069 + }, + { + "epoch": 0.65, + "grad_norm": 1.2838584184646606, + "learning_rate": 2.8895210063259775e-06, + "loss": 0.565, + "step": 5070 + }, + { + "epoch": 0.65, + "grad_norm": 1.2869141101837158, + "learning_rate": 2.887640068776546e-06, + "loss": 0.5993, + "step": 5071 + }, + { + "epoch": 0.65, + "grad_norm": 1.1139177083969116, + "learning_rate": 2.885759495038179e-06, + "loss": 0.5741, + "step": 5072 + }, + { + "epoch": 0.65, + "grad_norm": 2.5964291095733643, + "learning_rate": 2.883879285434763e-06, + "loss": 0.6509, + "step": 5073 + }, + { + "epoch": 0.65, + "grad_norm": 0.9919441938400269, + "learning_rate": 2.8819994402901276e-06, + "loss": 0.4702, + "step": 5074 + }, + { + "epoch": 0.65, + "grad_norm": 1.2516149282455444, + "learning_rate": 2.8801199599280417e-06, + "loss": 0.6224, + "step": 5075 + }, + { + "epoch": 0.65, + "grad_norm": 1.0911529064178467, + "learning_rate": 2.878240844672202e-06, + "loss": 0.5022, + "step": 5076 + }, + { + "epoch": 0.65, + "grad_norm": 1.3920502662658691, + "learning_rate": 2.876362094846251e-06, + "loss": 0.6245, + "step": 5077 + }, + { + "epoch": 0.65, + "grad_norm": 1.443747878074646, + "learning_rate": 2.874483710773765e-06, + "loss": 0.5352, + "step": 5078 + }, + { + "epoch": 0.65, + "grad_norm": 1.816619634628296, + "learning_rate": 2.8726056927782587e-06, + "loss": 0.6492, + "step": 5079 + }, + { + "epoch": 0.65, + "grad_norm": 1.262075662612915, + "learning_rate": 2.8707280411831796e-06, + "loss": 0.5654, + "step": 5080 + }, + { + "epoch": 0.65, + "grad_norm": 1.3001986742019653, + "learning_rate": 2.868850756311915e-06, + "loss": 0.5528, + "step": 5081 + }, + { + "epoch": 0.65, + "grad_norm": 1.3010063171386719, + "learning_rate": 2.8669738384877915e-06, + "loss": 0.5112, + "step": 5082 + }, + { + "epoch": 0.65, + "grad_norm": 1.350829839706421, + "learning_rate": 2.8650972880340704e-06, + "loss": 0.6077, + "step": 5083 + }, + { + "epoch": 0.65, + "grad_norm": 1.1886601448059082, + "learning_rate": 2.8632211052739463e-06, + "loss": 0.6004, + "step": 5084 + }, + { + "epoch": 0.65, + "grad_norm": 2.6389498710632324, + "learning_rate": 2.861345290530555e-06, + "loss": 0.6008, + "step": 5085 + }, + { + "epoch": 0.65, + "grad_norm": 2.1080713272094727, + "learning_rate": 2.8594698441269696e-06, + "loss": 0.5678, + "step": 5086 + }, + { + "epoch": 0.65, + "grad_norm": 1.1639949083328247, + "learning_rate": 2.8575947663861935e-06, + "loss": 0.6014, + "step": 5087 + }, + { + "epoch": 0.65, + "grad_norm": 1.455990195274353, + "learning_rate": 2.855720057631173e-06, + "loss": 0.5077, + "step": 5088 + }, + { + "epoch": 0.65, + "grad_norm": 1.3436378240585327, + "learning_rate": 2.853845718184789e-06, + "loss": 0.5635, + "step": 5089 + }, + { + "epoch": 0.65, + "grad_norm": 1.1918082237243652, + "learning_rate": 2.851971748369859e-06, + "loss": 0.5182, + "step": 5090 + }, + { + "epoch": 0.65, + "grad_norm": 1.5144091844558716, + "learning_rate": 2.8500981485091313e-06, + "loss": 0.69, + "step": 5091 + }, + { + "epoch": 0.65, + "grad_norm": 1.447807788848877, + "learning_rate": 2.848224918925301e-06, + "loss": 0.6154, + "step": 5092 + }, + { + "epoch": 0.65, + "grad_norm": 1.535978078842163, + "learning_rate": 2.8463520599409945e-06, + "loss": 0.5446, + "step": 5093 + }, + { + "epoch": 0.65, + "grad_norm": 1.2916719913482666, + "learning_rate": 2.844479571878769e-06, + "loss": 0.5821, + "step": 5094 + }, + { + "epoch": 0.65, + "grad_norm": 1.3413225412368774, + "learning_rate": 2.8426074550611237e-06, + "loss": 0.5991, + "step": 5095 + }, + { + "epoch": 0.65, + "grad_norm": 1.7014421224594116, + "learning_rate": 2.840735709810495e-06, + "loss": 0.6615, + "step": 5096 + }, + { + "epoch": 0.65, + "grad_norm": 1.217027187347412, + "learning_rate": 2.838864336449253e-06, + "loss": 0.7111, + "step": 5097 + }, + { + "epoch": 0.65, + "grad_norm": 1.2105580568313599, + "learning_rate": 2.8369933352997003e-06, + "loss": 0.552, + "step": 5098 + }, + { + "epoch": 0.65, + "grad_norm": 1.383817434310913, + "learning_rate": 2.8351227066840805e-06, + "loss": 0.6348, + "step": 5099 + }, + { + "epoch": 0.65, + "grad_norm": 1.2009333372116089, + "learning_rate": 2.8332524509245718e-06, + "loss": 0.539, + "step": 5100 + }, + { + "epoch": 0.65, + "grad_norm": 1.331572413444519, + "learning_rate": 2.8313825683432906e-06, + "loss": 0.5791, + "step": 5101 + }, + { + "epoch": 0.65, + "grad_norm": 1.6446709632873535, + "learning_rate": 2.8295130592622797e-06, + "loss": 0.6404, + "step": 5102 + }, + { + "epoch": 0.65, + "grad_norm": 1.5512125492095947, + "learning_rate": 2.8276439240035287e-06, + "loss": 0.542, + "step": 5103 + }, + { + "epoch": 0.65, + "grad_norm": 1.3419227600097656, + "learning_rate": 2.8257751628889564e-06, + "loss": 0.5675, + "step": 5104 + }, + { + "epoch": 0.65, + "grad_norm": 1.3091492652893066, + "learning_rate": 2.8239067762404216e-06, + "loss": 0.6319, + "step": 5105 + }, + { + "epoch": 0.65, + "grad_norm": 1.235048770904541, + "learning_rate": 2.822038764379712e-06, + "loss": 0.6524, + "step": 5106 + }, + { + "epoch": 0.65, + "grad_norm": 1.292203664779663, + "learning_rate": 2.820171127628557e-06, + "loss": 0.5984, + "step": 5107 + }, + { + "epoch": 0.65, + "grad_norm": 3.168729543685913, + "learning_rate": 2.818303866308618e-06, + "loss": 0.6435, + "step": 5108 + }, + { + "epoch": 0.65, + "grad_norm": 1.1043180227279663, + "learning_rate": 2.8164369807414936e-06, + "loss": 0.6302, + "step": 5109 + }, + { + "epoch": 0.65, + "grad_norm": 1.3122977018356323, + "learning_rate": 2.8145704712487167e-06, + "loss": 0.6408, + "step": 5110 + }, + { + "epoch": 0.65, + "grad_norm": 1.1558114290237427, + "learning_rate": 2.8127043381517553e-06, + "loss": 0.5507, + "step": 5111 + }, + { + "epoch": 0.65, + "grad_norm": 1.5054841041564941, + "learning_rate": 2.810838581772015e-06, + "loss": 0.6062, + "step": 5112 + }, + { + "epoch": 0.66, + "grad_norm": 1.2152769565582275, + "learning_rate": 2.8089732024308316e-06, + "loss": 0.623, + "step": 5113 + }, + { + "epoch": 0.66, + "grad_norm": 1.117659091949463, + "learning_rate": 2.807108200449479e-06, + "loss": 0.5331, + "step": 5114 + }, + { + "epoch": 0.66, + "grad_norm": 1.4485636949539185, + "learning_rate": 2.805243576149167e-06, + "loss": 0.6216, + "step": 5115 + }, + { + "epoch": 0.66, + "grad_norm": 1.1245951652526855, + "learning_rate": 2.8033793298510415e-06, + "loss": 0.6768, + "step": 5116 + }, + { + "epoch": 0.66, + "grad_norm": 1.2359367609024048, + "learning_rate": 2.8015154618761754e-06, + "loss": 0.611, + "step": 5117 + }, + { + "epoch": 0.66, + "grad_norm": 1.2246320247650146, + "learning_rate": 2.7996519725455857e-06, + "loss": 0.5716, + "step": 5118 + }, + { + "epoch": 0.66, + "grad_norm": 1.1994571685791016, + "learning_rate": 2.7977888621802196e-06, + "loss": 0.5599, + "step": 5119 + }, + { + "epoch": 0.66, + "grad_norm": 1.1807100772857666, + "learning_rate": 2.7959261311009623e-06, + "loss": 0.583, + "step": 5120 + }, + { + "epoch": 0.66, + "grad_norm": 1.2274185419082642, + "learning_rate": 2.794063779628628e-06, + "loss": 0.5591, + "step": 5121 + }, + { + "epoch": 0.66, + "grad_norm": 1.1783159971237183, + "learning_rate": 2.79220180808397e-06, + "loss": 0.5576, + "step": 5122 + }, + { + "epoch": 0.66, + "grad_norm": 1.355054497718811, + "learning_rate": 2.790340216787676e-06, + "loss": 0.6449, + "step": 5123 + }, + { + "epoch": 0.66, + "grad_norm": 1.0858644247055054, + "learning_rate": 2.788479006060368e-06, + "loss": 0.7109, + "step": 5124 + }, + { + "epoch": 0.66, + "grad_norm": 1.1023110151290894, + "learning_rate": 2.7866181762225964e-06, + "loss": 0.5975, + "step": 5125 + }, + { + "epoch": 0.66, + "grad_norm": 1.26510751247406, + "learning_rate": 2.7847577275948573e-06, + "loss": 0.6797, + "step": 5126 + }, + { + "epoch": 0.66, + "grad_norm": 1.3188729286193848, + "learning_rate": 2.7828976604975756e-06, + "loss": 0.6283, + "step": 5127 + }, + { + "epoch": 0.66, + "grad_norm": 1.3665924072265625, + "learning_rate": 2.7810379752511045e-06, + "loss": 0.539, + "step": 5128 + }, + { + "epoch": 0.66, + "grad_norm": 1.152752161026001, + "learning_rate": 2.779178672175741e-06, + "loss": 0.5775, + "step": 5129 + }, + { + "epoch": 0.66, + "grad_norm": 1.2595551013946533, + "learning_rate": 2.777319751591711e-06, + "loss": 0.6191, + "step": 5130 + }, + { + "epoch": 0.66, + "grad_norm": 1.1859368085861206, + "learning_rate": 2.7754612138191784e-06, + "loss": 0.5934, + "step": 5131 + }, + { + "epoch": 0.66, + "grad_norm": 1.0668327808380127, + "learning_rate": 2.7736030591782337e-06, + "loss": 0.7114, + "step": 5132 + }, + { + "epoch": 0.66, + "grad_norm": 1.3377395868301392, + "learning_rate": 2.7717452879889094e-06, + "loss": 0.6773, + "step": 5133 + }, + { + "epoch": 0.66, + "grad_norm": 1.3250497579574585, + "learning_rate": 2.7698879005711684e-06, + "loss": 0.5363, + "step": 5134 + }, + { + "epoch": 0.66, + "grad_norm": 1.2125672101974487, + "learning_rate": 2.768030897244909e-06, + "loss": 0.5669, + "step": 5135 + }, + { + "epoch": 0.66, + "grad_norm": 1.2120472192764282, + "learning_rate": 2.76617427832996e-06, + "loss": 0.5805, + "step": 5136 + }, + { + "epoch": 0.66, + "grad_norm": 1.2930200099945068, + "learning_rate": 2.764318044146087e-06, + "loss": 0.6083, + "step": 5137 + }, + { + "epoch": 0.66, + "grad_norm": 1.3229888677597046, + "learning_rate": 2.762462195012991e-06, + "loss": 0.5454, + "step": 5138 + }, + { + "epoch": 0.66, + "grad_norm": 1.3981189727783203, + "learning_rate": 2.7606067312503006e-06, + "loss": 0.6181, + "step": 5139 + }, + { + "epoch": 0.66, + "grad_norm": 1.3310023546218872, + "learning_rate": 2.7587516531775826e-06, + "loss": 0.5589, + "step": 5140 + }, + { + "epoch": 0.66, + "grad_norm": 1.1334054470062256, + "learning_rate": 2.7568969611143377e-06, + "loss": 0.5911, + "step": 5141 + }, + { + "epoch": 0.66, + "grad_norm": 1.3404936790466309, + "learning_rate": 2.755042655379998e-06, + "loss": 0.624, + "step": 5142 + }, + { + "epoch": 0.66, + "grad_norm": 1.3292677402496338, + "learning_rate": 2.7531887362939314e-06, + "loss": 0.598, + "step": 5143 + }, + { + "epoch": 0.66, + "grad_norm": 1.5356954336166382, + "learning_rate": 2.751335204175436e-06, + "loss": 0.6055, + "step": 5144 + }, + { + "epoch": 0.66, + "grad_norm": 1.472981333732605, + "learning_rate": 2.7494820593437483e-06, + "loss": 0.6197, + "step": 5145 + }, + { + "epoch": 0.66, + "grad_norm": 1.6259320974349976, + "learning_rate": 2.74762930211803e-06, + "loss": 0.5957, + "step": 5146 + }, + { + "epoch": 0.66, + "grad_norm": 1.2277649641036987, + "learning_rate": 2.745776932817384e-06, + "loss": 0.5525, + "step": 5147 + }, + { + "epoch": 0.66, + "grad_norm": 1.352315068244934, + "learning_rate": 2.743924951760842e-06, + "loss": 0.5822, + "step": 5148 + }, + { + "epoch": 0.66, + "grad_norm": 1.4033784866333008, + "learning_rate": 2.7420733592673727e-06, + "loss": 0.5787, + "step": 5149 + }, + { + "epoch": 0.66, + "grad_norm": 1.3330798149108887, + "learning_rate": 2.740222155655871e-06, + "loss": 0.5817, + "step": 5150 + }, + { + "epoch": 0.66, + "grad_norm": 1.3742144107818604, + "learning_rate": 2.7383713412451716e-06, + "loss": 0.5179, + "step": 5151 + }, + { + "epoch": 0.66, + "grad_norm": 1.388723373413086, + "learning_rate": 2.736520916354039e-06, + "loss": 0.4809, + "step": 5152 + }, + { + "epoch": 0.66, + "grad_norm": 1.3881044387817383, + "learning_rate": 2.734670881301174e-06, + "loss": 0.5648, + "step": 5153 + }, + { + "epoch": 0.66, + "grad_norm": 1.7049732208251953, + "learning_rate": 2.732821236405203e-06, + "loss": 0.5736, + "step": 5154 + }, + { + "epoch": 0.66, + "grad_norm": 1.0168077945709229, + "learning_rate": 2.730971981984692e-06, + "loss": 0.6677, + "step": 5155 + }, + { + "epoch": 0.66, + "grad_norm": 1.2047303915023804, + "learning_rate": 2.729123118358137e-06, + "loss": 0.6059, + "step": 5156 + }, + { + "epoch": 0.66, + "grad_norm": 1.1460449695587158, + "learning_rate": 2.7272746458439705e-06, + "loss": 0.6915, + "step": 5157 + }, + { + "epoch": 0.66, + "grad_norm": 1.4450008869171143, + "learning_rate": 2.7254265647605483e-06, + "loss": 0.6629, + "step": 5158 + }, + { + "epoch": 0.66, + "grad_norm": 1.5762354135513306, + "learning_rate": 2.723578875426166e-06, + "loss": 0.6685, + "step": 5159 + }, + { + "epoch": 0.66, + "grad_norm": 1.3964108228683472, + "learning_rate": 2.721731578159057e-06, + "loss": 0.6259, + "step": 5160 + }, + { + "epoch": 0.66, + "grad_norm": 1.066676139831543, + "learning_rate": 2.7198846732773743e-06, + "loss": 0.7189, + "step": 5161 + }, + { + "epoch": 0.66, + "grad_norm": 1.1809970140457153, + "learning_rate": 2.718038161099211e-06, + "loss": 0.6223, + "step": 5162 + }, + { + "epoch": 0.66, + "grad_norm": 1.1365050077438354, + "learning_rate": 2.716192041942592e-06, + "loss": 0.5089, + "step": 5163 + }, + { + "epoch": 0.66, + "grad_norm": 1.1270561218261719, + "learning_rate": 2.7143463161254755e-06, + "loss": 0.555, + "step": 5164 + }, + { + "epoch": 0.66, + "grad_norm": 3.720628261566162, + "learning_rate": 2.712500983965747e-06, + "loss": 0.6347, + "step": 5165 + }, + { + "epoch": 0.66, + "grad_norm": 1.4290040731430054, + "learning_rate": 2.710656045781228e-06, + "loss": 0.5405, + "step": 5166 + }, + { + "epoch": 0.66, + "grad_norm": 2.559084415435791, + "learning_rate": 2.7088115018896725e-06, + "loss": 0.6434, + "step": 5167 + }, + { + "epoch": 0.66, + "grad_norm": 1.1094157695770264, + "learning_rate": 2.706967352608768e-06, + "loss": 0.5956, + "step": 5168 + }, + { + "epoch": 0.66, + "grad_norm": 1.2786990404129028, + "learning_rate": 2.7051235982561275e-06, + "loss": 0.515, + "step": 5169 + }, + { + "epoch": 0.66, + "grad_norm": 1.1976306438446045, + "learning_rate": 2.703280239149302e-06, + "loss": 0.5842, + "step": 5170 + }, + { + "epoch": 0.66, + "grad_norm": 1.3613533973693848, + "learning_rate": 2.701437275605773e-06, + "loss": 0.6508, + "step": 5171 + }, + { + "epoch": 0.66, + "grad_norm": 1.403113842010498, + "learning_rate": 2.699594707942955e-06, + "loss": 0.6158, + "step": 5172 + }, + { + "epoch": 0.66, + "grad_norm": 1.390093445777893, + "learning_rate": 2.6977525364781887e-06, + "loss": 0.6485, + "step": 5173 + }, + { + "epoch": 0.66, + "grad_norm": 1.1422367095947266, + "learning_rate": 2.695910761528754e-06, + "loss": 0.6894, + "step": 5174 + }, + { + "epoch": 0.66, + "grad_norm": 1.3628041744232178, + "learning_rate": 2.694069383411857e-06, + "loss": 0.5549, + "step": 5175 + }, + { + "epoch": 0.66, + "grad_norm": 1.215430498123169, + "learning_rate": 2.692228402444642e-06, + "loss": 0.5934, + "step": 5176 + }, + { + "epoch": 0.66, + "grad_norm": 1.1754097938537598, + "learning_rate": 2.6903878189441734e-06, + "loss": 0.5572, + "step": 5177 + }, + { + "epoch": 0.66, + "grad_norm": 1.1725589036941528, + "learning_rate": 2.6885476332274598e-06, + "loss": 0.608, + "step": 5178 + }, + { + "epoch": 0.66, + "grad_norm": 1.1601642370224, + "learning_rate": 2.6867078456114367e-06, + "loss": 0.6104, + "step": 5179 + }, + { + "epoch": 0.66, + "grad_norm": 1.180969476699829, + "learning_rate": 2.6848684564129657e-06, + "loss": 0.5651, + "step": 5180 + }, + { + "epoch": 0.66, + "grad_norm": 1.457452654838562, + "learning_rate": 2.683029465948846e-06, + "loss": 0.6624, + "step": 5181 + }, + { + "epoch": 0.66, + "grad_norm": 1.2275680303573608, + "learning_rate": 2.6811908745358068e-06, + "loss": 0.7376, + "step": 5182 + }, + { + "epoch": 0.66, + "grad_norm": 1.2394523620605469, + "learning_rate": 2.6793526824905102e-06, + "loss": 0.6944, + "step": 5183 + }, + { + "epoch": 0.66, + "grad_norm": 1.4745672941207886, + "learning_rate": 2.677514890129543e-06, + "loss": 0.5372, + "step": 5184 + }, + { + "epoch": 0.66, + "grad_norm": 1.3252058029174805, + "learning_rate": 2.6756774977694295e-06, + "loss": 0.6018, + "step": 5185 + }, + { + "epoch": 0.66, + "grad_norm": 1.290786623954773, + "learning_rate": 2.6738405057266255e-06, + "loss": 0.613, + "step": 5186 + }, + { + "epoch": 0.66, + "grad_norm": 1.545142650604248, + "learning_rate": 2.6720039143175116e-06, + "loss": 0.6156, + "step": 5187 + }, + { + "epoch": 0.66, + "grad_norm": 1.6830646991729736, + "learning_rate": 2.6701677238584046e-06, + "loss": 0.5804, + "step": 5188 + }, + { + "epoch": 0.66, + "grad_norm": 1.2544286251068115, + "learning_rate": 2.6683319346655523e-06, + "loss": 0.5945, + "step": 5189 + }, + { + "epoch": 0.66, + "grad_norm": 1.4033396244049072, + "learning_rate": 2.666496547055133e-06, + "loss": 0.5992, + "step": 5190 + }, + { + "epoch": 0.67, + "grad_norm": 1.134125828742981, + "learning_rate": 2.6646615613432507e-06, + "loss": 0.6052, + "step": 5191 + }, + { + "epoch": 0.67, + "grad_norm": 1.1849713325500488, + "learning_rate": 2.6628269778459475e-06, + "loss": 0.6269, + "step": 5192 + }, + { + "epoch": 0.67, + "grad_norm": 1.2358061075210571, + "learning_rate": 2.660992796879193e-06, + "loss": 0.6058, + "step": 5193 + }, + { + "epoch": 0.67, + "grad_norm": 1.204371452331543, + "learning_rate": 2.659159018758886e-06, + "loss": 0.5892, + "step": 5194 + }, + { + "epoch": 0.67, + "grad_norm": 1.3742873668670654, + "learning_rate": 2.6573256438008594e-06, + "loss": 0.5877, + "step": 5195 + }, + { + "epoch": 0.67, + "grad_norm": 1.4056979417800903, + "learning_rate": 2.655492672320874e-06, + "loss": 0.6435, + "step": 5196 + }, + { + "epoch": 0.67, + "grad_norm": 1.357175350189209, + "learning_rate": 2.653660104634624e-06, + "loss": 0.6442, + "step": 5197 + }, + { + "epoch": 0.67, + "grad_norm": 1.4769208431243896, + "learning_rate": 2.6518279410577276e-06, + "loss": 0.5422, + "step": 5198 + }, + { + "epoch": 0.67, + "grad_norm": 1.2639411687850952, + "learning_rate": 2.6499961819057396e-06, + "loss": 0.6191, + "step": 5199 + }, + { + "epoch": 0.67, + "grad_norm": 1.4961445331573486, + "learning_rate": 2.648164827494144e-06, + "loss": 0.6537, + "step": 5200 + }, + { + "epoch": 0.67, + "grad_norm": 1.1992703676223755, + "learning_rate": 2.6463338781383563e-06, + "loss": 0.5623, + "step": 5201 + }, + { + "epoch": 0.67, + "grad_norm": 1.429692029953003, + "learning_rate": 2.6445033341537164e-06, + "loss": 0.5856, + "step": 5202 + }, + { + "epoch": 0.67, + "grad_norm": 1.4961892366409302, + "learning_rate": 2.642673195855499e-06, + "loss": 0.6466, + "step": 5203 + }, + { + "epoch": 0.67, + "grad_norm": 1.2756520509719849, + "learning_rate": 2.6408434635589096e-06, + "loss": 0.5343, + "step": 5204 + }, + { + "epoch": 0.67, + "grad_norm": 1.2954007387161255, + "learning_rate": 2.6390141375790834e-06, + "loss": 0.6814, + "step": 5205 + }, + { + "epoch": 0.67, + "grad_norm": 1.1817203760147095, + "learning_rate": 2.637185218231082e-06, + "loss": 0.5147, + "step": 5206 + }, + { + "epoch": 0.67, + "grad_norm": 1.356261134147644, + "learning_rate": 2.6353567058299006e-06, + "loss": 0.6249, + "step": 5207 + }, + { + "epoch": 0.67, + "grad_norm": 1.4610151052474976, + "learning_rate": 2.633528600690463e-06, + "loss": 0.6509, + "step": 5208 + }, + { + "epoch": 0.67, + "grad_norm": 1.2745957374572754, + "learning_rate": 2.6317009031276264e-06, + "loss": 0.7069, + "step": 5209 + }, + { + "epoch": 0.67, + "grad_norm": 1.1911793947219849, + "learning_rate": 2.6298736134561686e-06, + "loss": 0.5957, + "step": 5210 + }, + { + "epoch": 0.67, + "grad_norm": 1.2115494012832642, + "learning_rate": 2.6280467319908052e-06, + "loss": 0.4903, + "step": 5211 + }, + { + "epoch": 0.67, + "grad_norm": 1.1515758037567139, + "learning_rate": 2.6262202590461843e-06, + "loss": 0.6614, + "step": 5212 + }, + { + "epoch": 0.67, + "grad_norm": 1.148162841796875, + "learning_rate": 2.6243941949368728e-06, + "loss": 0.6236, + "step": 5213 + }, + { + "epoch": 0.67, + "grad_norm": 1.7739685773849487, + "learning_rate": 2.622568539977375e-06, + "loss": 0.6557, + "step": 5214 + }, + { + "epoch": 0.67, + "grad_norm": 1.455483078956604, + "learning_rate": 2.620743294482123e-06, + "loss": 0.6547, + "step": 5215 + }, + { + "epoch": 0.67, + "grad_norm": 1.486600399017334, + "learning_rate": 2.6189184587654813e-06, + "loss": 0.6379, + "step": 5216 + }, + { + "epoch": 0.67, + "grad_norm": 1.2012253999710083, + "learning_rate": 2.617094033141735e-06, + "loss": 0.546, + "step": 5217 + }, + { + "epoch": 0.67, + "grad_norm": 1.324657917022705, + "learning_rate": 2.615270017925107e-06, + "loss": 0.6812, + "step": 5218 + }, + { + "epoch": 0.67, + "grad_norm": 2.661184310913086, + "learning_rate": 2.6134464134297476e-06, + "loss": 0.6015, + "step": 5219 + }, + { + "epoch": 0.67, + "grad_norm": 1.1693406105041504, + "learning_rate": 2.611623219969737e-06, + "loss": 0.5996, + "step": 5220 + }, + { + "epoch": 0.67, + "grad_norm": 1.1396194696426392, + "learning_rate": 2.609800437859078e-06, + "loss": 0.6311, + "step": 5221 + }, + { + "epoch": 0.67, + "grad_norm": 1.285706877708435, + "learning_rate": 2.607978067411712e-06, + "loss": 0.6305, + "step": 5222 + }, + { + "epoch": 0.67, + "grad_norm": 1.161902666091919, + "learning_rate": 2.606156108941504e-06, + "loss": 0.6345, + "step": 5223 + }, + { + "epoch": 0.67, + "grad_norm": 1.4399522542953491, + "learning_rate": 2.6043345627622513e-06, + "loss": 0.5897, + "step": 5224 + }, + { + "epoch": 0.67, + "grad_norm": 1.349932312965393, + "learning_rate": 2.6025134291876745e-06, + "loss": 0.6166, + "step": 5225 + }, + { + "epoch": 0.67, + "grad_norm": 1.8268938064575195, + "learning_rate": 2.6006927085314295e-06, + "loss": 0.6521, + "step": 5226 + }, + { + "epoch": 0.67, + "grad_norm": 1.7306489944458008, + "learning_rate": 2.5988724011070967e-06, + "loss": 0.5793, + "step": 5227 + }, + { + "epoch": 0.67, + "grad_norm": 1.346104621887207, + "learning_rate": 2.597052507228189e-06, + "loss": 0.596, + "step": 5228 + }, + { + "epoch": 0.67, + "grad_norm": 1.2005935907363892, + "learning_rate": 2.5952330272081446e-06, + "loss": 0.6136, + "step": 5229 + }, + { + "epoch": 0.67, + "grad_norm": 1.2340837717056274, + "learning_rate": 2.5934139613603326e-06, + "loss": 0.5841, + "step": 5230 + }, + { + "epoch": 0.67, + "grad_norm": 1.3165960311889648, + "learning_rate": 2.591595309998052e-06, + "loss": 0.59, + "step": 5231 + }, + { + "epoch": 0.67, + "grad_norm": 1.0761138200759888, + "learning_rate": 2.5897770734345253e-06, + "loss": 0.5112, + "step": 5232 + }, + { + "epoch": 0.67, + "grad_norm": 1.1540740728378296, + "learning_rate": 2.5879592519829065e-06, + "loss": 0.5531, + "step": 5233 + }, + { + "epoch": 0.67, + "grad_norm": 1.2632431983947754, + "learning_rate": 2.586141845956284e-06, + "loss": 0.5473, + "step": 5234 + }, + { + "epoch": 0.67, + "grad_norm": 1.3133299350738525, + "learning_rate": 2.5843248556676615e-06, + "loss": 0.5852, + "step": 5235 + }, + { + "epoch": 0.67, + "grad_norm": 1.290602207183838, + "learning_rate": 2.582508281429983e-06, + "loss": 0.5788, + "step": 5236 + }, + { + "epoch": 0.67, + "grad_norm": 1.1755783557891846, + "learning_rate": 2.5806921235561157e-06, + "loss": 0.528, + "step": 5237 + }, + { + "epoch": 0.67, + "grad_norm": 1.6222681999206543, + "learning_rate": 2.578876382358857e-06, + "loss": 0.6104, + "step": 5238 + }, + { + "epoch": 0.67, + "grad_norm": 1.2842351198196411, + "learning_rate": 2.5770610581509292e-06, + "loss": 0.528, + "step": 5239 + }, + { + "epoch": 0.67, + "grad_norm": 1.1351863145828247, + "learning_rate": 2.5752461512449854e-06, + "loss": 0.6943, + "step": 5240 + }, + { + "epoch": 0.67, + "grad_norm": 2.076362133026123, + "learning_rate": 2.5734316619536072e-06, + "loss": 0.611, + "step": 5241 + }, + { + "epoch": 0.67, + "grad_norm": 1.2188138961791992, + "learning_rate": 2.571617590589306e-06, + "loss": 0.6463, + "step": 5242 + }, + { + "epoch": 0.67, + "grad_norm": 1.3224034309387207, + "learning_rate": 2.5698039374645134e-06, + "loss": 0.6016, + "step": 5243 + }, + { + "epoch": 0.67, + "grad_norm": 1.2040905952453613, + "learning_rate": 2.5679907028915972e-06, + "loss": 0.5456, + "step": 5244 + }, + { + "epoch": 0.67, + "grad_norm": 1.2921534776687622, + "learning_rate": 2.5661778871828503e-06, + "loss": 0.6099, + "step": 5245 + }, + { + "epoch": 0.67, + "grad_norm": 1.2232110500335693, + "learning_rate": 2.564365490650493e-06, + "loss": 0.6297, + "step": 5246 + }, + { + "epoch": 0.67, + "grad_norm": 1.3627408742904663, + "learning_rate": 2.562553513606674e-06, + "loss": 0.5766, + "step": 5247 + }, + { + "epoch": 0.67, + "grad_norm": 1.229413390159607, + "learning_rate": 2.5607419563634682e-06, + "loss": 0.5605, + "step": 5248 + }, + { + "epoch": 0.67, + "grad_norm": 1.2325100898742676, + "learning_rate": 2.558930819232883e-06, + "loss": 0.6456, + "step": 5249 + }, + { + "epoch": 0.67, + "grad_norm": 1.1989376544952393, + "learning_rate": 2.5571201025268468e-06, + "loss": 0.641, + "step": 5250 + }, + { + "epoch": 0.67, + "grad_norm": 1.2480047941207886, + "learning_rate": 2.5553098065572186e-06, + "loss": 0.5573, + "step": 5251 + }, + { + "epoch": 0.67, + "grad_norm": 1.2484368085861206, + "learning_rate": 2.553499931635786e-06, + "loss": 0.6038, + "step": 5252 + }, + { + "epoch": 0.67, + "grad_norm": 1.4819079637527466, + "learning_rate": 2.5516904780742658e-06, + "loss": 0.6092, + "step": 5253 + }, + { + "epoch": 0.67, + "grad_norm": 1.2783153057098389, + "learning_rate": 2.5498814461842942e-06, + "loss": 0.6426, + "step": 5254 + }, + { + "epoch": 0.67, + "grad_norm": 1.3205689191818237, + "learning_rate": 2.548072836277443e-06, + "loss": 0.6066, + "step": 5255 + }, + { + "epoch": 0.67, + "grad_norm": 1.2751951217651367, + "learning_rate": 2.5462646486652094e-06, + "loss": 0.5752, + "step": 5256 + }, + { + "epoch": 0.67, + "grad_norm": 1.27857506275177, + "learning_rate": 2.5444568836590174e-06, + "loss": 0.667, + "step": 5257 + }, + { + "epoch": 0.67, + "grad_norm": 1.5317552089691162, + "learning_rate": 2.5426495415702146e-06, + "loss": 0.6109, + "step": 5258 + }, + { + "epoch": 0.67, + "grad_norm": 1.290687918663025, + "learning_rate": 2.54084262271008e-06, + "loss": 0.6179, + "step": 5259 + }, + { + "epoch": 0.67, + "grad_norm": 1.1564019918441772, + "learning_rate": 2.5390361273898207e-06, + "loss": 0.5787, + "step": 5260 + }, + { + "epoch": 0.67, + "grad_norm": 1.4289751052856445, + "learning_rate": 2.537230055920569e-06, + "loss": 0.5601, + "step": 5261 + }, + { + "epoch": 0.67, + "grad_norm": 1.3122379779815674, + "learning_rate": 2.5354244086133784e-06, + "loss": 0.6218, + "step": 5262 + }, + { + "epoch": 0.67, + "grad_norm": 1.3078715801239014, + "learning_rate": 2.533619185779241e-06, + "loss": 0.5187, + "step": 5263 + }, + { + "epoch": 0.67, + "grad_norm": 1.357385277748108, + "learning_rate": 2.531814387729069e-06, + "loss": 0.648, + "step": 5264 + }, + { + "epoch": 0.67, + "grad_norm": 1.2903344631195068, + "learning_rate": 2.5300100147737007e-06, + "loss": 0.5953, + "step": 5265 + }, + { + "epoch": 0.67, + "grad_norm": 1.2929610013961792, + "learning_rate": 2.5282060672239016e-06, + "loss": 0.519, + "step": 5266 + }, + { + "epoch": 0.67, + "grad_norm": 1.800912618637085, + "learning_rate": 2.526402545390367e-06, + "loss": 0.6361, + "step": 5267 + }, + { + "epoch": 0.67, + "grad_norm": 2.328589916229248, + "learning_rate": 2.524599449583718e-06, + "loss": 0.6149, + "step": 5268 + }, + { + "epoch": 0.68, + "grad_norm": 1.3592402935028076, + "learning_rate": 2.5227967801144972e-06, + "loss": 0.5711, + "step": 5269 + }, + { + "epoch": 0.68, + "grad_norm": 1.2721518278121948, + "learning_rate": 2.5209945372931798e-06, + "loss": 0.5685, + "step": 5270 + }, + { + "epoch": 0.68, + "grad_norm": 1.2351329326629639, + "learning_rate": 2.519192721430166e-06, + "loss": 0.5586, + "step": 5271 + }, + { + "epoch": 0.68, + "grad_norm": 2.6479928493499756, + "learning_rate": 2.5173913328357836e-06, + "loss": 0.5282, + "step": 5272 + }, + { + "epoch": 0.68, + "grad_norm": 1.0487334728240967, + "learning_rate": 2.515590371820281e-06, + "loss": 0.6852, + "step": 5273 + }, + { + "epoch": 0.68, + "grad_norm": 1.3234119415283203, + "learning_rate": 2.5137898386938396e-06, + "loss": 0.5542, + "step": 5274 + }, + { + "epoch": 0.68, + "grad_norm": 1.2834035158157349, + "learning_rate": 2.511989733766565e-06, + "loss": 0.5604, + "step": 5275 + }, + { + "epoch": 0.68, + "grad_norm": 1.3077657222747803, + "learning_rate": 2.510190057348489e-06, + "loss": 0.656, + "step": 5276 + }, + { + "epoch": 0.68, + "grad_norm": 1.3545429706573486, + "learning_rate": 2.508390809749567e-06, + "loss": 0.5638, + "step": 5277 + }, + { + "epoch": 0.68, + "grad_norm": 1.1976977586746216, + "learning_rate": 2.5065919912796845e-06, + "loss": 0.5891, + "step": 5278 + }, + { + "epoch": 0.68, + "grad_norm": 1.1058452129364014, + "learning_rate": 2.5047936022486503e-06, + "loss": 0.5514, + "step": 5279 + }, + { + "epoch": 0.68, + "grad_norm": 1.2233741283416748, + "learning_rate": 2.5029956429662017e-06, + "loss": 0.5231, + "step": 5280 + }, + { + "epoch": 0.68, + "grad_norm": 1.187469244003296, + "learning_rate": 2.5011981137419993e-06, + "loss": 0.6487, + "step": 5281 + }, + { + "epoch": 0.68, + "grad_norm": 1.358062744140625, + "learning_rate": 2.4994010148856325e-06, + "loss": 0.6192, + "step": 5282 + }, + { + "epoch": 0.68, + "grad_norm": 1.3537427186965942, + "learning_rate": 2.4976043467066164e-06, + "loss": 0.5947, + "step": 5283 + }, + { + "epoch": 0.68, + "grad_norm": 1.1666181087493896, + "learning_rate": 2.495808109514386e-06, + "loss": 0.5528, + "step": 5284 + }, + { + "epoch": 0.68, + "grad_norm": 1.244334101676941, + "learning_rate": 2.4940123036183095e-06, + "loss": 0.5853, + "step": 5285 + }, + { + "epoch": 0.68, + "grad_norm": 1.467995524406433, + "learning_rate": 2.4922169293276793e-06, + "loss": 0.5277, + "step": 5286 + }, + { + "epoch": 0.68, + "grad_norm": 1.112685203552246, + "learning_rate": 2.4904219869517083e-06, + "loss": 0.6663, + "step": 5287 + }, + { + "epoch": 0.68, + "grad_norm": 1.6050626039505005, + "learning_rate": 2.488627476799542e-06, + "loss": 0.583, + "step": 5288 + }, + { + "epoch": 0.68, + "grad_norm": 1.614817500114441, + "learning_rate": 2.486833399180246e-06, + "loss": 0.5568, + "step": 5289 + }, + { + "epoch": 0.68, + "grad_norm": 1.259895920753479, + "learning_rate": 2.485039754402817e-06, + "loss": 0.6091, + "step": 5290 + }, + { + "epoch": 0.68, + "grad_norm": 1.1175788640975952, + "learning_rate": 2.4832465427761704e-06, + "loss": 0.6088, + "step": 5291 + }, + { + "epoch": 0.68, + "grad_norm": 1.3290629386901855, + "learning_rate": 2.481453764609152e-06, + "loss": 0.6004, + "step": 5292 + }, + { + "epoch": 0.68, + "grad_norm": 1.1337230205535889, + "learning_rate": 2.4796614202105313e-06, + "loss": 0.4849, + "step": 5293 + }, + { + "epoch": 0.68, + "grad_norm": 1.3604735136032104, + "learning_rate": 2.477869509889005e-06, + "loss": 0.5763, + "step": 5294 + }, + { + "epoch": 0.68, + "grad_norm": 1.551064372062683, + "learning_rate": 2.476078033953189e-06, + "loss": 0.5731, + "step": 5295 + }, + { + "epoch": 0.68, + "grad_norm": 1.4335790872573853, + "learning_rate": 2.4742869927116297e-06, + "loss": 0.5623, + "step": 5296 + }, + { + "epoch": 0.68, + "grad_norm": 1.4575905799865723, + "learning_rate": 2.4724963864728026e-06, + "loss": 0.6491, + "step": 5297 + }, + { + "epoch": 0.68, + "grad_norm": 1.3047614097595215, + "learning_rate": 2.470706215545097e-06, + "loss": 0.5279, + "step": 5298 + }, + { + "epoch": 0.68, + "grad_norm": 1.3483816385269165, + "learning_rate": 2.4689164802368366e-06, + "loss": 0.636, + "step": 5299 + }, + { + "epoch": 0.68, + "grad_norm": 1.285593032836914, + "learning_rate": 2.4671271808562664e-06, + "loss": 0.5903, + "step": 5300 + }, + { + "epoch": 0.68, + "grad_norm": 1.0162348747253418, + "learning_rate": 2.465338317711558e-06, + "loss": 0.5889, + "step": 5301 + }, + { + "epoch": 0.68, + "grad_norm": 1.4397096633911133, + "learning_rate": 2.4635498911108042e-06, + "loss": 0.5798, + "step": 5302 + }, + { + "epoch": 0.68, + "grad_norm": 1.4754639863967896, + "learning_rate": 2.4617619013620257e-06, + "loss": 0.6134, + "step": 5303 + }, + { + "epoch": 0.68, + "grad_norm": 1.1740292310714722, + "learning_rate": 2.4599743487731686e-06, + "loss": 0.5692, + "step": 5304 + }, + { + "epoch": 0.68, + "grad_norm": 1.4390628337860107, + "learning_rate": 2.458187233652104e-06, + "loss": 0.6033, + "step": 5305 + }, + { + "epoch": 0.68, + "grad_norm": 1.326101541519165, + "learning_rate": 2.456400556306622e-06, + "loss": 0.5816, + "step": 5306 + }, + { + "epoch": 0.68, + "grad_norm": 1.6704646348953247, + "learning_rate": 2.454614317044443e-06, + "loss": 0.5938, + "step": 5307 + }, + { + "epoch": 0.68, + "grad_norm": 1.2074356079101562, + "learning_rate": 2.452828516173212e-06, + "loss": 0.5318, + "step": 5308 + }, + { + "epoch": 0.68, + "grad_norm": 1.418071985244751, + "learning_rate": 2.451043154000497e-06, + "loss": 0.6688, + "step": 5309 + }, + { + "epoch": 0.68, + "grad_norm": 1.3660101890563965, + "learning_rate": 2.449258230833788e-06, + "loss": 0.5901, + "step": 5310 + }, + { + "epoch": 0.68, + "grad_norm": 1.3401819467544556, + "learning_rate": 2.4474737469805026e-06, + "loss": 0.6244, + "step": 5311 + }, + { + "epoch": 0.68, + "grad_norm": 1.1151803731918335, + "learning_rate": 2.445689702747982e-06, + "loss": 0.6078, + "step": 5312 + }, + { + "epoch": 0.68, + "grad_norm": 1.3116505146026611, + "learning_rate": 2.443906098443494e-06, + "loss": 0.5469, + "step": 5313 + }, + { + "epoch": 0.68, + "grad_norm": 1.2464821338653564, + "learning_rate": 2.4421229343742224e-06, + "loss": 0.573, + "step": 5314 + }, + { + "epoch": 0.68, + "grad_norm": 1.6851762533187866, + "learning_rate": 2.4403402108472863e-06, + "loss": 0.6493, + "step": 5315 + }, + { + "epoch": 0.68, + "grad_norm": 1.3235703706741333, + "learning_rate": 2.4385579281697235e-06, + "loss": 0.5459, + "step": 5316 + }, + { + "epoch": 0.68, + "grad_norm": 1.267038106918335, + "learning_rate": 2.436776086648493e-06, + "loss": 0.6437, + "step": 5317 + }, + { + "epoch": 0.68, + "grad_norm": 1.639235258102417, + "learning_rate": 2.4349946865904815e-06, + "loss": 0.5845, + "step": 5318 + }, + { + "epoch": 0.68, + "grad_norm": 1.2454874515533447, + "learning_rate": 2.4332137283024997e-06, + "loss": 0.537, + "step": 5319 + }, + { + "epoch": 0.68, + "grad_norm": 1.361704707145691, + "learning_rate": 2.431433212091284e-06, + "loss": 0.5855, + "step": 5320 + }, + { + "epoch": 0.68, + "grad_norm": 1.4523319005966187, + "learning_rate": 2.4296531382634864e-06, + "loss": 0.626, + "step": 5321 + }, + { + "epoch": 0.68, + "grad_norm": 1.2574588060379028, + "learning_rate": 2.427873507125692e-06, + "loss": 0.6157, + "step": 5322 + }, + { + "epoch": 0.68, + "grad_norm": 2.1008527278900146, + "learning_rate": 2.4260943189844055e-06, + "loss": 0.593, + "step": 5323 + }, + { + "epoch": 0.68, + "grad_norm": 1.3145480155944824, + "learning_rate": 2.4243155741460583e-06, + "loss": 0.5892, + "step": 5324 + }, + { + "epoch": 0.68, + "grad_norm": 1.1647745370864868, + "learning_rate": 2.422537272916998e-06, + "loss": 0.5774, + "step": 5325 + }, + { + "epoch": 0.68, + "grad_norm": 1.1349399089813232, + "learning_rate": 2.4207594156035042e-06, + "loss": 0.5195, + "step": 5326 + }, + { + "epoch": 0.68, + "grad_norm": 1.4423859119415283, + "learning_rate": 2.4189820025117762e-06, + "loss": 0.6294, + "step": 5327 + }, + { + "epoch": 0.68, + "grad_norm": 1.2302815914154053, + "learning_rate": 2.417205033947938e-06, + "loss": 0.5202, + "step": 5328 + }, + { + "epoch": 0.68, + "grad_norm": 1.4219542741775513, + "learning_rate": 2.415428510218035e-06, + "loss": 0.6441, + "step": 5329 + }, + { + "epoch": 0.68, + "grad_norm": 1.2709050178527832, + "learning_rate": 2.413652431628036e-06, + "loss": 0.6086, + "step": 5330 + }, + { + "epoch": 0.68, + "grad_norm": 1.3104877471923828, + "learning_rate": 2.4118767984838376e-06, + "loss": 0.5737, + "step": 5331 + }, + { + "epoch": 0.68, + "grad_norm": 1.25847327709198, + "learning_rate": 2.4101016110912547e-06, + "loss": 0.5736, + "step": 5332 + }, + { + "epoch": 0.68, + "grad_norm": 1.170403003692627, + "learning_rate": 2.4083268697560276e-06, + "loss": 0.5607, + "step": 5333 + }, + { + "epoch": 0.68, + "grad_norm": 1.3111367225646973, + "learning_rate": 2.406552574783821e-06, + "loss": 0.5241, + "step": 5334 + }, + { + "epoch": 0.68, + "grad_norm": 1.4980742931365967, + "learning_rate": 2.404778726480218e-06, + "loss": 0.6526, + "step": 5335 + }, + { + "epoch": 0.68, + "grad_norm": 1.3784537315368652, + "learning_rate": 2.4030053251507298e-06, + "loss": 0.5576, + "step": 5336 + }, + { + "epoch": 0.68, + "grad_norm": 1.2792404890060425, + "learning_rate": 2.401232371100788e-06, + "loss": 0.6557, + "step": 5337 + }, + { + "epoch": 0.68, + "grad_norm": 1.282593846321106, + "learning_rate": 2.3994598646357505e-06, + "loss": 0.5845, + "step": 5338 + }, + { + "epoch": 0.68, + "grad_norm": 1.3621476888656616, + "learning_rate": 2.397687806060891e-06, + "loss": 0.5418, + "step": 5339 + }, + { + "epoch": 0.68, + "grad_norm": 1.4283099174499512, + "learning_rate": 2.3959161956814136e-06, + "loss": 0.588, + "step": 5340 + }, + { + "epoch": 0.68, + "grad_norm": 1.1404139995574951, + "learning_rate": 2.394145033802441e-06, + "loss": 0.577, + "step": 5341 + }, + { + "epoch": 0.68, + "grad_norm": 1.461607813835144, + "learning_rate": 2.392374320729023e-06, + "loss": 0.6742, + "step": 5342 + }, + { + "epoch": 0.68, + "grad_norm": 1.1831786632537842, + "learning_rate": 2.390604056766124e-06, + "loss": 0.6059, + "step": 5343 + }, + { + "epoch": 0.68, + "grad_norm": 1.6188664436340332, + "learning_rate": 2.3888342422186376e-06, + "loss": 0.5266, + "step": 5344 + }, + { + "epoch": 0.68, + "grad_norm": 1.6657450199127197, + "learning_rate": 2.3870648773913796e-06, + "loss": 0.6137, + "step": 5345 + }, + { + "epoch": 0.68, + "grad_norm": 1.1973358392715454, + "learning_rate": 2.3852959625890888e-06, + "loss": 0.6311, + "step": 5346 + }, + { + "epoch": 0.69, + "grad_norm": 1.5780502557754517, + "learning_rate": 2.38352749811642e-06, + "loss": 0.6452, + "step": 5347 + }, + { + "epoch": 0.69, + "grad_norm": 1.6269686222076416, + "learning_rate": 2.3817594842779566e-06, + "loss": 0.5092, + "step": 5348 + }, + { + "epoch": 0.69, + "grad_norm": 1.3493552207946777, + "learning_rate": 2.379991921378208e-06, + "loss": 0.6042, + "step": 5349 + }, + { + "epoch": 0.69, + "grad_norm": 1.241943359375, + "learning_rate": 2.378224809721595e-06, + "loss": 0.5149, + "step": 5350 + }, + { + "epoch": 0.69, + "grad_norm": 1.2631124258041382, + "learning_rate": 2.3764581496124693e-06, + "loss": 0.5638, + "step": 5351 + }, + { + "epoch": 0.69, + "grad_norm": 1.1910855770111084, + "learning_rate": 2.374691941355102e-06, + "loss": 0.5865, + "step": 5352 + }, + { + "epoch": 0.69, + "grad_norm": 4.200222015380859, + "learning_rate": 2.372926185253688e-06, + "loss": 0.5512, + "step": 5353 + }, + { + "epoch": 0.69, + "grad_norm": 1.0969185829162598, + "learning_rate": 2.3711608816123393e-06, + "loss": 0.5978, + "step": 5354 + }, + { + "epoch": 0.69, + "grad_norm": 1.5703184604644775, + "learning_rate": 2.3693960307350953e-06, + "loss": 0.5803, + "step": 5355 + }, + { + "epoch": 0.69, + "grad_norm": 1.877293586730957, + "learning_rate": 2.367631632925917e-06, + "loss": 0.5711, + "step": 5356 + }, + { + "epoch": 0.69, + "grad_norm": 1.0474374294281006, + "learning_rate": 2.365867688488686e-06, + "loss": 0.5735, + "step": 5357 + }, + { + "epoch": 0.69, + "grad_norm": 1.2668507099151611, + "learning_rate": 2.364104197727204e-06, + "loss": 0.5971, + "step": 5358 + }, + { + "epoch": 0.69, + "grad_norm": 1.335062861442566, + "learning_rate": 2.362341160945197e-06, + "loss": 0.5533, + "step": 5359 + }, + { + "epoch": 0.69, + "grad_norm": 1.19700026512146, + "learning_rate": 2.360578578446312e-06, + "loss": 0.5404, + "step": 5360 + }, + { + "epoch": 0.69, + "grad_norm": 1.3640122413635254, + "learning_rate": 2.358816450534121e-06, + "loss": 0.636, + "step": 5361 + }, + { + "epoch": 0.69, + "grad_norm": 1.0688934326171875, + "learning_rate": 2.3570547775121106e-06, + "loss": 0.6033, + "step": 5362 + }, + { + "epoch": 0.69, + "grad_norm": 1.1979340314865112, + "learning_rate": 2.3552935596836947e-06, + "loss": 0.554, + "step": 5363 + }, + { + "epoch": 0.69, + "grad_norm": 1.4938658475875854, + "learning_rate": 2.353532797352207e-06, + "loss": 0.5977, + "step": 5364 + }, + { + "epoch": 0.69, + "grad_norm": 1.255603551864624, + "learning_rate": 2.3517724908209037e-06, + "loss": 0.5904, + "step": 5365 + }, + { + "epoch": 0.69, + "grad_norm": 1.557724118232727, + "learning_rate": 2.3500126403929624e-06, + "loss": 0.6582, + "step": 5366 + }, + { + "epoch": 0.69, + "grad_norm": 1.2635000944137573, + "learning_rate": 2.3482532463714803e-06, + "loss": 0.5664, + "step": 5367 + }, + { + "epoch": 0.69, + "grad_norm": 1.5641952753067017, + "learning_rate": 2.3464943090594794e-06, + "loss": 0.6332, + "step": 5368 + }, + { + "epoch": 0.69, + "grad_norm": 1.1680552959442139, + "learning_rate": 2.3447358287598975e-06, + "loss": 0.625, + "step": 5369 + }, + { + "epoch": 0.69, + "grad_norm": 1.9742357730865479, + "learning_rate": 2.3429778057755994e-06, + "loss": 0.5614, + "step": 5370 + }, + { + "epoch": 0.69, + "grad_norm": 1.2470797300338745, + "learning_rate": 2.3412202404093675e-06, + "loss": 0.5732, + "step": 5371 + }, + { + "epoch": 0.69, + "grad_norm": 1.2727478742599487, + "learning_rate": 2.3394631329639094e-06, + "loss": 0.5777, + "step": 5372 + }, + { + "epoch": 0.69, + "grad_norm": 1.1417211294174194, + "learning_rate": 2.337706483741848e-06, + "loss": 0.6026, + "step": 5373 + }, + { + "epoch": 0.69, + "grad_norm": 1.1219482421875, + "learning_rate": 2.3359502930457306e-06, + "loss": 0.5525, + "step": 5374 + }, + { + "epoch": 0.69, + "grad_norm": 1.3681706190109253, + "learning_rate": 2.334194561178027e-06, + "loss": 0.5578, + "step": 5375 + }, + { + "epoch": 0.69, + "grad_norm": 1.2046383619308472, + "learning_rate": 2.3324392884411277e-06, + "loss": 0.5576, + "step": 5376 + }, + { + "epoch": 0.69, + "grad_norm": 3.306382894515991, + "learning_rate": 2.3306844751373384e-06, + "loss": 0.5266, + "step": 5377 + }, + { + "epoch": 0.69, + "grad_norm": 1.108659267425537, + "learning_rate": 2.328930121568893e-06, + "loss": 0.6962, + "step": 5378 + }, + { + "epoch": 0.69, + "grad_norm": 1.2322014570236206, + "learning_rate": 2.3271762280379446e-06, + "loss": 0.5757, + "step": 5379 + }, + { + "epoch": 0.69, + "grad_norm": 1.400467038154602, + "learning_rate": 2.3254227948465613e-06, + "loss": 0.5913, + "step": 5380 + }, + { + "epoch": 0.69, + "grad_norm": 1.3882756233215332, + "learning_rate": 2.3236698222967392e-06, + "loss": 0.5503, + "step": 5381 + }, + { + "epoch": 0.69, + "grad_norm": 1.5921300649642944, + "learning_rate": 2.321917310690392e-06, + "loss": 0.5449, + "step": 5382 + }, + { + "epoch": 0.69, + "grad_norm": 1.7357591390609741, + "learning_rate": 2.3201652603293535e-06, + "loss": 0.5941, + "step": 5383 + }, + { + "epoch": 0.69, + "grad_norm": 1.2445812225341797, + "learning_rate": 2.318413671515379e-06, + "loss": 0.5878, + "step": 5384 + }, + { + "epoch": 0.69, + "grad_norm": 1.2913782596588135, + "learning_rate": 2.3166625445501444e-06, + "loss": 0.6032, + "step": 5385 + }, + { + "epoch": 0.69, + "grad_norm": 1.1985303163528442, + "learning_rate": 2.3149118797352484e-06, + "loss": 0.5549, + "step": 5386 + }, + { + "epoch": 0.69, + "grad_norm": 2.0271339416503906, + "learning_rate": 2.3131616773722015e-06, + "loss": 0.5392, + "step": 5387 + }, + { + "epoch": 0.69, + "grad_norm": 1.3953230381011963, + "learning_rate": 2.3114119377624444e-06, + "loss": 0.6916, + "step": 5388 + }, + { + "epoch": 0.69, + "grad_norm": 1.073756456375122, + "learning_rate": 2.309662661207334e-06, + "loss": 0.6164, + "step": 5389 + }, + { + "epoch": 0.69, + "grad_norm": 1.0587557554244995, + "learning_rate": 2.3079138480081474e-06, + "loss": 0.5769, + "step": 5390 + }, + { + "epoch": 0.69, + "grad_norm": 1.3357759714126587, + "learning_rate": 2.3061654984660808e-06, + "loss": 0.6175, + "step": 5391 + }, + { + "epoch": 0.69, + "grad_norm": 1.4631446599960327, + "learning_rate": 2.3044176128822533e-06, + "loss": 0.5939, + "step": 5392 + }, + { + "epoch": 0.69, + "grad_norm": 1.3898251056671143, + "learning_rate": 2.3026701915577017e-06, + "loss": 0.5235, + "step": 5393 + }, + { + "epoch": 0.69, + "grad_norm": 1.3785183429718018, + "learning_rate": 2.3009232347933858e-06, + "loss": 0.647, + "step": 5394 + }, + { + "epoch": 0.69, + "grad_norm": 1.1423410177230835, + "learning_rate": 2.299176742890181e-06, + "loss": 0.5851, + "step": 5395 + }, + { + "epoch": 0.69, + "grad_norm": 1.523036003112793, + "learning_rate": 2.297430716148885e-06, + "loss": 0.5871, + "step": 5396 + }, + { + "epoch": 0.69, + "grad_norm": 1.1296528577804565, + "learning_rate": 2.295685154870217e-06, + "loss": 0.5476, + "step": 5397 + }, + { + "epoch": 0.69, + "grad_norm": 1.4960590600967407, + "learning_rate": 2.2939400593548157e-06, + "loss": 0.6542, + "step": 5398 + }, + { + "epoch": 0.69, + "grad_norm": 1.2516299486160278, + "learning_rate": 2.2921954299032324e-06, + "loss": 0.5171, + "step": 5399 + }, + { + "epoch": 0.69, + "grad_norm": 1.3827989101409912, + "learning_rate": 2.29045126681595e-06, + "loss": 0.6073, + "step": 5400 + }, + { + "epoch": 0.69, + "grad_norm": 1.7401831150054932, + "learning_rate": 2.2887075703933654e-06, + "loss": 0.5981, + "step": 5401 + }, + { + "epoch": 0.69, + "grad_norm": 1.0633769035339355, + "learning_rate": 2.2869643409357907e-06, + "loss": 0.7061, + "step": 5402 + }, + { + "epoch": 0.69, + "grad_norm": 1.2310799360275269, + "learning_rate": 2.2852215787434636e-06, + "loss": 0.5535, + "step": 5403 + }, + { + "epoch": 0.69, + "grad_norm": 1.2269682884216309, + "learning_rate": 2.28347928411654e-06, + "loss": 0.606, + "step": 5404 + }, + { + "epoch": 0.69, + "grad_norm": 1.3008508682250977, + "learning_rate": 2.2817374573550956e-06, + "loss": 0.5632, + "step": 5405 + }, + { + "epoch": 0.69, + "grad_norm": 1.3585628271102905, + "learning_rate": 2.2799960987591217e-06, + "loss": 0.604, + "step": 5406 + }, + { + "epoch": 0.69, + "grad_norm": 1.6831282377243042, + "learning_rate": 2.278255208628534e-06, + "loss": 0.6772, + "step": 5407 + }, + { + "epoch": 0.69, + "grad_norm": 1.3406838178634644, + "learning_rate": 2.2765147872631656e-06, + "loss": 0.6374, + "step": 5408 + }, + { + "epoch": 0.69, + "grad_norm": 1.5567865371704102, + "learning_rate": 2.274774834962769e-06, + "loss": 0.6538, + "step": 5409 + }, + { + "epoch": 0.69, + "grad_norm": 1.564103364944458, + "learning_rate": 2.273035352027013e-06, + "loss": 0.6838, + "step": 5410 + }, + { + "epoch": 0.69, + "grad_norm": 1.5919641256332397, + "learning_rate": 2.271296338755491e-06, + "loss": 0.6681, + "step": 5411 + }, + { + "epoch": 0.69, + "grad_norm": 1.4023327827453613, + "learning_rate": 2.269557795447711e-06, + "loss": 0.5633, + "step": 5412 + }, + { + "epoch": 0.69, + "grad_norm": 1.1743390560150146, + "learning_rate": 2.267819722403104e-06, + "loss": 0.519, + "step": 5413 + }, + { + "epoch": 0.69, + "grad_norm": 1.3634856939315796, + "learning_rate": 2.2660821199210147e-06, + "loss": 0.6134, + "step": 5414 + }, + { + "epoch": 0.69, + "grad_norm": 1.3316655158996582, + "learning_rate": 2.2643449883007113e-06, + "loss": 0.6312, + "step": 5415 + }, + { + "epoch": 0.69, + "grad_norm": 1.128868818283081, + "learning_rate": 2.26260832784138e-06, + "loss": 0.45, + "step": 5416 + }, + { + "epoch": 0.69, + "grad_norm": 1.427899718284607, + "learning_rate": 2.2608721388421233e-06, + "loss": 0.6127, + "step": 5417 + }, + { + "epoch": 0.69, + "grad_norm": 16.15768051147461, + "learning_rate": 2.259136421601967e-06, + "loss": 0.6433, + "step": 5418 + }, + { + "epoch": 0.69, + "grad_norm": 1.452759027481079, + "learning_rate": 2.257401176419851e-06, + "loss": 0.5636, + "step": 5419 + }, + { + "epoch": 0.69, + "grad_norm": 1.5467267036437988, + "learning_rate": 2.2556664035946396e-06, + "loss": 0.6037, + "step": 5420 + }, + { + "epoch": 0.69, + "grad_norm": 1.701102375984192, + "learning_rate": 2.253932103425107e-06, + "loss": 0.5748, + "step": 5421 + }, + { + "epoch": 0.69, + "grad_norm": 1.346367359161377, + "learning_rate": 2.2521982762099544e-06, + "loss": 0.6572, + "step": 5422 + }, + { + "epoch": 0.69, + "grad_norm": 1.1707795858383179, + "learning_rate": 2.2504649222477977e-06, + "loss": 0.6369, + "step": 5423 + }, + { + "epoch": 0.69, + "grad_norm": 1.1806703805923462, + "learning_rate": 2.248732041837173e-06, + "loss": 0.616, + "step": 5424 + }, + { + "epoch": 0.7, + "grad_norm": 1.6231908798217773, + "learning_rate": 2.2469996352765307e-06, + "loss": 0.5244, + "step": 5425 + }, + { + "epoch": 0.7, + "grad_norm": 1.244320034980774, + "learning_rate": 2.2452677028642445e-06, + "loss": 0.6273, + "step": 5426 + }, + { + "epoch": 0.7, + "grad_norm": 5.8792548179626465, + "learning_rate": 2.2435362448986058e-06, + "loss": 0.6268, + "step": 5427 + }, + { + "epoch": 0.7, + "grad_norm": 1.297541618347168, + "learning_rate": 2.241805261677821e-06, + "loss": 0.5361, + "step": 5428 + }, + { + "epoch": 0.7, + "grad_norm": 1.4641773700714111, + "learning_rate": 2.240074753500017e-06, + "loss": 0.5059, + "step": 5429 + }, + { + "epoch": 0.7, + "grad_norm": 1.5284756422042847, + "learning_rate": 2.2383447206632385e-06, + "loss": 0.5743, + "step": 5430 + }, + { + "epoch": 0.7, + "grad_norm": 1.548749327659607, + "learning_rate": 2.2366151634654513e-06, + "loss": 0.6469, + "step": 5431 + }, + { + "epoch": 0.7, + "grad_norm": 1.4602359533309937, + "learning_rate": 2.2348860822045327e-06, + "loss": 0.5945, + "step": 5432 + }, + { + "epoch": 0.7, + "grad_norm": 1.2962963581085205, + "learning_rate": 2.233157477178281e-06, + "loss": 0.63, + "step": 5433 + }, + { + "epoch": 0.7, + "grad_norm": 1.1280606985092163, + "learning_rate": 2.23142934868442e-06, + "loss": 0.5783, + "step": 5434 + }, + { + "epoch": 0.7, + "grad_norm": 1.4649392366409302, + "learning_rate": 2.229701697020578e-06, + "loss": 0.6784, + "step": 5435 + }, + { + "epoch": 0.7, + "grad_norm": 1.415880799293518, + "learning_rate": 2.2279745224843108e-06, + "loss": 0.6329, + "step": 5436 + }, + { + "epoch": 0.7, + "grad_norm": 1.3583704233169556, + "learning_rate": 2.2262478253730885e-06, + "loss": 0.5051, + "step": 5437 + }, + { + "epoch": 0.7, + "grad_norm": 1.3856008052825928, + "learning_rate": 2.224521605984302e-06, + "loss": 0.6311, + "step": 5438 + }, + { + "epoch": 0.7, + "grad_norm": 1.266484022140503, + "learning_rate": 2.2227958646152525e-06, + "loss": 0.6008, + "step": 5439 + }, + { + "epoch": 0.7, + "grad_norm": 1.2216291427612305, + "learning_rate": 2.2210706015631666e-06, + "loss": 0.5565, + "step": 5440 + }, + { + "epoch": 0.7, + "grad_norm": 1.1825257539749146, + "learning_rate": 2.2193458171251857e-06, + "loss": 0.6352, + "step": 5441 + }, + { + "epoch": 0.7, + "grad_norm": 0.9359205365180969, + "learning_rate": 2.217621511598371e-06, + "loss": 0.5556, + "step": 5442 + }, + { + "epoch": 0.7, + "grad_norm": 1.3200385570526123, + "learning_rate": 2.2158976852796956e-06, + "loss": 0.5984, + "step": 5443 + }, + { + "epoch": 0.7, + "grad_norm": 1.6388661861419678, + "learning_rate": 2.2141743384660542e-06, + "loss": 0.6564, + "step": 5444 + }, + { + "epoch": 0.7, + "grad_norm": 1.454256296157837, + "learning_rate": 2.212451471454259e-06, + "loss": 0.5588, + "step": 5445 + }, + { + "epoch": 0.7, + "grad_norm": 1.3445405960083008, + "learning_rate": 2.2107290845410405e-06, + "loss": 0.5404, + "step": 5446 + }, + { + "epoch": 0.7, + "grad_norm": 1.1842902898788452, + "learning_rate": 2.209007178023042e-06, + "loss": 0.6045, + "step": 5447 + }, + { + "epoch": 0.7, + "grad_norm": 1.4644899368286133, + "learning_rate": 2.2072857521968276e-06, + "loss": 0.5565, + "step": 5448 + }, + { + "epoch": 0.7, + "grad_norm": 1.2165268659591675, + "learning_rate": 2.205564807358878e-06, + "loss": 0.6259, + "step": 5449 + }, + { + "epoch": 0.7, + "grad_norm": 1.3028157949447632, + "learning_rate": 2.203844343805594e-06, + "loss": 0.5122, + "step": 5450 + }, + { + "epoch": 0.7, + "grad_norm": 1.135292410850525, + "learning_rate": 2.2021243618332833e-06, + "loss": 0.54, + "step": 5451 + }, + { + "epoch": 0.7, + "grad_norm": 1.196251630783081, + "learning_rate": 2.2004048617381844e-06, + "loss": 0.4666, + "step": 5452 + }, + { + "epoch": 0.7, + "grad_norm": 1.4005775451660156, + "learning_rate": 2.1986858438164464e-06, + "loss": 0.5846, + "step": 5453 + }, + { + "epoch": 0.7, + "grad_norm": 1.7641587257385254, + "learning_rate": 2.196967308364131e-06, + "loss": 0.578, + "step": 5454 + }, + { + "epoch": 0.7, + "grad_norm": 1.2260916233062744, + "learning_rate": 2.1952492556772226e-06, + "loss": 0.5843, + "step": 5455 + }, + { + "epoch": 0.7, + "grad_norm": 1.4907604455947876, + "learning_rate": 2.1935316860516214e-06, + "loss": 0.6536, + "step": 5456 + }, + { + "epoch": 0.7, + "grad_norm": 1.4494532346725464, + "learning_rate": 2.191814599783146e-06, + "loss": 0.5521, + "step": 5457 + }, + { + "epoch": 0.7, + "grad_norm": 1.3400883674621582, + "learning_rate": 2.190097997167525e-06, + "loss": 0.576, + "step": 5458 + }, + { + "epoch": 0.7, + "grad_norm": 1.1813629865646362, + "learning_rate": 2.1883818785004108e-06, + "loss": 0.7461, + "step": 5459 + }, + { + "epoch": 0.7, + "grad_norm": 1.175892949104309, + "learning_rate": 2.1866662440773694e-06, + "loss": 0.6967, + "step": 5460 + }, + { + "epoch": 0.7, + "grad_norm": 1.5057560205459595, + "learning_rate": 2.1849510941938864e-06, + "loss": 0.637, + "step": 5461 + }, + { + "epoch": 0.7, + "grad_norm": 1.2532135248184204, + "learning_rate": 2.1832364291453572e-06, + "loss": 0.6955, + "step": 5462 + }, + { + "epoch": 0.7, + "grad_norm": 1.2681691646575928, + "learning_rate": 2.181522249227101e-06, + "loss": 0.6327, + "step": 5463 + }, + { + "epoch": 0.7, + "grad_norm": 1.4270204305648804, + "learning_rate": 2.179808554734349e-06, + "loss": 0.5954, + "step": 5464 + }, + { + "epoch": 0.7, + "grad_norm": 1.4886845350265503, + "learning_rate": 2.1780953459622526e-06, + "loss": 0.5569, + "step": 5465 + }, + { + "epoch": 0.7, + "grad_norm": 1.3294905424118042, + "learning_rate": 2.1763826232058745e-06, + "loss": 0.6214, + "step": 5466 + }, + { + "epoch": 0.7, + "grad_norm": 1.3741010427474976, + "learning_rate": 2.174670386760197e-06, + "loss": 0.6151, + "step": 5467 + }, + { + "epoch": 0.7, + "grad_norm": 2.0721983909606934, + "learning_rate": 2.1729586369201184e-06, + "loss": 0.7426, + "step": 5468 + }, + { + "epoch": 0.7, + "grad_norm": 1.39492666721344, + "learning_rate": 2.1712473739804524e-06, + "loss": 0.6307, + "step": 5469 + }, + { + "epoch": 0.7, + "grad_norm": 1.437631368637085, + "learning_rate": 2.16953659823593e-06, + "loss": 0.5988, + "step": 5470 + }, + { + "epoch": 0.7, + "grad_norm": 1.322919487953186, + "learning_rate": 2.1678263099811973e-06, + "loss": 0.6527, + "step": 5471 + }, + { + "epoch": 0.7, + "grad_norm": 1.1208932399749756, + "learning_rate": 2.166116509510819e-06, + "loss": 0.5896, + "step": 5472 + }, + { + "epoch": 0.7, + "grad_norm": 1.3424168825149536, + "learning_rate": 2.1644071971192687e-06, + "loss": 0.5752, + "step": 5473 + }, + { + "epoch": 0.7, + "grad_norm": 1.3916758298873901, + "learning_rate": 2.162698373100943e-06, + "loss": 0.6444, + "step": 5474 + }, + { + "epoch": 0.7, + "grad_norm": 1.0904909372329712, + "learning_rate": 2.1609900377501526e-06, + "loss": 0.6417, + "step": 5475 + }, + { + "epoch": 0.7, + "grad_norm": 1.8144199848175049, + "learning_rate": 2.1592821913611246e-06, + "loss": 0.5684, + "step": 5476 + }, + { + "epoch": 0.7, + "grad_norm": 1.6254833936691284, + "learning_rate": 2.157574834227998e-06, + "loss": 0.6259, + "step": 5477 + }, + { + "epoch": 0.7, + "grad_norm": 1.1566519737243652, + "learning_rate": 2.1558679666448314e-06, + "loss": 0.5346, + "step": 5478 + }, + { + "epoch": 0.7, + "grad_norm": 1.5531538724899292, + "learning_rate": 2.1541615889056e-06, + "loss": 0.6, + "step": 5479 + }, + { + "epoch": 0.7, + "grad_norm": 1.1958260536193848, + "learning_rate": 2.1524557013041897e-06, + "loss": 0.6024, + "step": 5480 + }, + { + "epoch": 0.7, + "grad_norm": 1.4889098405838013, + "learning_rate": 2.1507503041344063e-06, + "loss": 0.6399, + "step": 5481 + }, + { + "epoch": 0.7, + "grad_norm": 1.2659668922424316, + "learning_rate": 2.14904539768997e-06, + "loss": 0.5371, + "step": 5482 + }, + { + "epoch": 0.7, + "grad_norm": 1.2588521242141724, + "learning_rate": 2.1473409822645174e-06, + "loss": 0.6951, + "step": 5483 + }, + { + "epoch": 0.7, + "grad_norm": 1.2568881511688232, + "learning_rate": 2.1456370581515967e-06, + "loss": 0.5739, + "step": 5484 + }, + { + "epoch": 0.7, + "grad_norm": 1.5543744564056396, + "learning_rate": 2.143933625644674e-06, + "loss": 0.5692, + "step": 5485 + }, + { + "epoch": 0.7, + "grad_norm": 1.441428542137146, + "learning_rate": 2.142230685037136e-06, + "loss": 0.6467, + "step": 5486 + }, + { + "epoch": 0.7, + "grad_norm": 1.2142436504364014, + "learning_rate": 2.140528236622274e-06, + "loss": 0.4952, + "step": 5487 + }, + { + "epoch": 0.7, + "grad_norm": 1.1973832845687866, + "learning_rate": 2.138826280693303e-06, + "loss": 0.5627, + "step": 5488 + }, + { + "epoch": 0.7, + "grad_norm": 1.733954668045044, + "learning_rate": 2.137124817543349e-06, + "loss": 0.604, + "step": 5489 + }, + { + "epoch": 0.7, + "grad_norm": 1.3623613119125366, + "learning_rate": 2.1354238474654574e-06, + "loss": 0.5886, + "step": 5490 + }, + { + "epoch": 0.7, + "grad_norm": 1.325785517692566, + "learning_rate": 2.133723370752581e-06, + "loss": 0.578, + "step": 5491 + }, + { + "epoch": 0.7, + "grad_norm": 1.2375460863113403, + "learning_rate": 2.1320233876975955e-06, + "loss": 0.5326, + "step": 5492 + }, + { + "epoch": 0.7, + "grad_norm": 1.4318372011184692, + "learning_rate": 2.130323898593287e-06, + "loss": 0.5755, + "step": 5493 + }, + { + "epoch": 0.7, + "grad_norm": 1.3102978467941284, + "learning_rate": 2.128624903732361e-06, + "loss": 0.6431, + "step": 5494 + }, + { + "epoch": 0.7, + "grad_norm": 1.4908846616744995, + "learning_rate": 2.12692640340743e-06, + "loss": 0.5952, + "step": 5495 + }, + { + "epoch": 0.7, + "grad_norm": 1.2359824180603027, + "learning_rate": 2.1252283979110292e-06, + "loss": 0.6169, + "step": 5496 + }, + { + "epoch": 0.7, + "grad_norm": 1.3551256656646729, + "learning_rate": 2.1235308875356048e-06, + "loss": 0.6417, + "step": 5497 + }, + { + "epoch": 0.7, + "grad_norm": 1.139798641204834, + "learning_rate": 2.1218338725735203e-06, + "loss": 0.5594, + "step": 5498 + }, + { + "epoch": 0.7, + "grad_norm": 1.2601267099380493, + "learning_rate": 2.1201373533170484e-06, + "loss": 0.6317, + "step": 5499 + }, + { + "epoch": 0.7, + "grad_norm": 1.3188377618789673, + "learning_rate": 2.1184413300583823e-06, + "loss": 0.6327, + "step": 5500 + }, + { + "epoch": 0.7, + "grad_norm": 1.4169940948486328, + "learning_rate": 2.116745803089627e-06, + "loss": 0.6216, + "step": 5501 + }, + { + "epoch": 0.7, + "grad_norm": 1.5327837467193604, + "learning_rate": 2.115050772702803e-06, + "loss": 0.6618, + "step": 5502 + }, + { + "epoch": 0.71, + "grad_norm": 1.3896936178207397, + "learning_rate": 2.1133562391898444e-06, + "loss": 0.5753, + "step": 5503 + }, + { + "epoch": 0.71, + "grad_norm": 1.2866517305374146, + "learning_rate": 2.1116622028426004e-06, + "loss": 0.6075, + "step": 5504 + }, + { + "epoch": 0.71, + "grad_norm": 1.4563747644424438, + "learning_rate": 2.109968663952836e-06, + "loss": 0.6092, + "step": 5505 + }, + { + "epoch": 0.71, + "grad_norm": 1.2761017084121704, + "learning_rate": 2.108275622812226e-06, + "loss": 0.6336, + "step": 5506 + }, + { + "epoch": 0.71, + "grad_norm": 1.3175703287124634, + "learning_rate": 2.1065830797123628e-06, + "loss": 0.5825, + "step": 5507 + }, + { + "epoch": 0.71, + "grad_norm": 1.1784989833831787, + "learning_rate": 2.1048910349447533e-06, + "loss": 0.5514, + "step": 5508 + }, + { + "epoch": 0.71, + "grad_norm": 1.3874294757843018, + "learning_rate": 2.1031994888008196e-06, + "loss": 0.5477, + "step": 5509 + }, + { + "epoch": 0.71, + "grad_norm": 1.0878545045852661, + "learning_rate": 2.1015084415718926e-06, + "loss": 0.5669, + "step": 5510 + }, + { + "epoch": 0.71, + "grad_norm": 1.1752663850784302, + "learning_rate": 2.0998178935492223e-06, + "loss": 0.5278, + "step": 5511 + }, + { + "epoch": 0.71, + "grad_norm": 1.5478566884994507, + "learning_rate": 2.098127845023972e-06, + "loss": 0.5717, + "step": 5512 + }, + { + "epoch": 0.71, + "grad_norm": 1.4689644575119019, + "learning_rate": 2.0964382962872187e-06, + "loss": 0.6032, + "step": 5513 + }, + { + "epoch": 0.71, + "grad_norm": 1.3008317947387695, + "learning_rate": 2.0947492476299503e-06, + "loss": 0.6083, + "step": 5514 + }, + { + "epoch": 0.71, + "grad_norm": 1.0789536237716675, + "learning_rate": 2.0930606993430722e-06, + "loss": 0.5137, + "step": 5515 + }, + { + "epoch": 0.71, + "grad_norm": 1.4409828186035156, + "learning_rate": 2.091372651717404e-06, + "loss": 0.6374, + "step": 5516 + }, + { + "epoch": 0.71, + "grad_norm": 1.2810938358306885, + "learning_rate": 2.0896851050436774e-06, + "loss": 0.6584, + "step": 5517 + }, + { + "epoch": 0.71, + "grad_norm": 1.3894275426864624, + "learning_rate": 2.0879980596125353e-06, + "loss": 0.5872, + "step": 5518 + }, + { + "epoch": 0.71, + "grad_norm": 1.4865697622299194, + "learning_rate": 2.086311515714537e-06, + "loss": 0.4925, + "step": 5519 + }, + { + "epoch": 0.71, + "grad_norm": 1.483933448791504, + "learning_rate": 2.084625473640161e-06, + "loss": 0.6136, + "step": 5520 + }, + { + "epoch": 0.71, + "grad_norm": 1.3818782567977905, + "learning_rate": 2.0829399336797883e-06, + "loss": 0.6241, + "step": 5521 + }, + { + "epoch": 0.71, + "grad_norm": 1.9844701290130615, + "learning_rate": 2.08125489612372e-06, + "loss": 0.641, + "step": 5522 + }, + { + "epoch": 0.71, + "grad_norm": 1.4045764207839966, + "learning_rate": 2.079570361262171e-06, + "loss": 0.5807, + "step": 5523 + }, + { + "epoch": 0.71, + "grad_norm": 1.5632199048995972, + "learning_rate": 2.0778863293852686e-06, + "loss": 0.607, + "step": 5524 + }, + { + "epoch": 0.71, + "grad_norm": 1.273563265800476, + "learning_rate": 2.07620280078305e-06, + "loss": 0.5969, + "step": 5525 + }, + { + "epoch": 0.71, + "grad_norm": 1.6909412145614624, + "learning_rate": 2.0745197757454704e-06, + "loss": 0.6227, + "step": 5526 + }, + { + "epoch": 0.71, + "grad_norm": 1.1659879684448242, + "learning_rate": 2.0728372545623987e-06, + "loss": 0.559, + "step": 5527 + }, + { + "epoch": 0.71, + "grad_norm": 2.438142776489258, + "learning_rate": 2.0711552375236115e-06, + "loss": 0.5949, + "step": 5528 + }, + { + "epoch": 0.71, + "grad_norm": 1.3141118288040161, + "learning_rate": 2.0694737249188036e-06, + "loss": 0.4984, + "step": 5529 + }, + { + "epoch": 0.71, + "grad_norm": 1.1047041416168213, + "learning_rate": 2.0677927170375812e-06, + "loss": 0.6739, + "step": 5530 + }, + { + "epoch": 0.71, + "grad_norm": 1.4703797101974487, + "learning_rate": 2.0661122141694655e-06, + "loss": 0.5354, + "step": 5531 + }, + { + "epoch": 0.71, + "grad_norm": 1.1246980428695679, + "learning_rate": 2.0644322166038867e-06, + "loss": 0.5546, + "step": 5532 + }, + { + "epoch": 0.71, + "grad_norm": 1.181230902671814, + "learning_rate": 2.06275272463019e-06, + "loss": 0.6074, + "step": 5533 + }, + { + "epoch": 0.71, + "grad_norm": 1.4038329124450684, + "learning_rate": 2.061073738537635e-06, + "loss": 0.5793, + "step": 5534 + }, + { + "epoch": 0.71, + "grad_norm": 1.2831275463104248, + "learning_rate": 2.059395258615394e-06, + "loss": 0.6463, + "step": 5535 + }, + { + "epoch": 0.71, + "grad_norm": 1.1998370885849, + "learning_rate": 2.057717285152547e-06, + "loss": 0.743, + "step": 5536 + }, + { + "epoch": 0.71, + "grad_norm": 1.8671464920043945, + "learning_rate": 2.056039818438095e-06, + "loss": 0.5823, + "step": 5537 + }, + { + "epoch": 0.71, + "grad_norm": 1.3943527936935425, + "learning_rate": 2.0543628587609472e-06, + "loss": 0.5931, + "step": 5538 + }, + { + "epoch": 0.71, + "grad_norm": 1.3470430374145508, + "learning_rate": 2.052686406409923e-06, + "loss": 0.5477, + "step": 5539 + }, + { + "epoch": 0.71, + "grad_norm": 1.0583560466766357, + "learning_rate": 2.0510104616737597e-06, + "loss": 0.6012, + "step": 5540 + }, + { + "epoch": 0.71, + "grad_norm": 1.5672448873519897, + "learning_rate": 2.0493350248411033e-06, + "loss": 0.6135, + "step": 5541 + }, + { + "epoch": 0.71, + "grad_norm": 1.6713660955429077, + "learning_rate": 2.047660096200516e-06, + "loss": 0.653, + "step": 5542 + }, + { + "epoch": 0.71, + "grad_norm": 1.2293089628219604, + "learning_rate": 2.045985676040467e-06, + "loss": 0.5229, + "step": 5543 + }, + { + "epoch": 0.71, + "grad_norm": 1.250230312347412, + "learning_rate": 2.044311764649343e-06, + "loss": 0.6772, + "step": 5544 + }, + { + "epoch": 0.71, + "grad_norm": 1.6269879341125488, + "learning_rate": 2.042638362315441e-06, + "loss": 0.6468, + "step": 5545 + }, + { + "epoch": 0.71, + "grad_norm": 1.4112759828567505, + "learning_rate": 2.040965469326972e-06, + "loss": 0.6261, + "step": 5546 + }, + { + "epoch": 0.71, + "grad_norm": 1.4061042070388794, + "learning_rate": 2.0392930859720545e-06, + "loss": 0.5911, + "step": 5547 + }, + { + "epoch": 0.71, + "grad_norm": 1.2331053018569946, + "learning_rate": 2.037621212538724e-06, + "loss": 0.5203, + "step": 5548 + }, + { + "epoch": 0.71, + "grad_norm": 1.3158512115478516, + "learning_rate": 2.035949849314928e-06, + "loss": 0.4839, + "step": 5549 + }, + { + "epoch": 0.71, + "grad_norm": 1.7110259532928467, + "learning_rate": 2.0342789965885246e-06, + "loss": 0.601, + "step": 5550 + }, + { + "epoch": 0.71, + "grad_norm": 1.6905916929244995, + "learning_rate": 2.0326086546472823e-06, + "loss": 0.6474, + "step": 5551 + }, + { + "epoch": 0.71, + "grad_norm": 1.428787112236023, + "learning_rate": 2.0309388237788847e-06, + "loss": 0.5905, + "step": 5552 + }, + { + "epoch": 0.71, + "grad_norm": 1.4208601713180542, + "learning_rate": 2.029269504270926e-06, + "loss": 0.6783, + "step": 5553 + }, + { + "epoch": 0.71, + "grad_norm": 1.5845845937728882, + "learning_rate": 2.027600696410913e-06, + "loss": 0.6614, + "step": 5554 + }, + { + "epoch": 0.71, + "grad_norm": 1.4801743030548096, + "learning_rate": 2.025932400486264e-06, + "loss": 0.6549, + "step": 5555 + }, + { + "epoch": 0.71, + "grad_norm": 1.0526586771011353, + "learning_rate": 2.0242646167843083e-06, + "loss": 0.5649, + "step": 5556 + }, + { + "epoch": 0.71, + "grad_norm": 1.4187215566635132, + "learning_rate": 2.022597345592291e-06, + "loss": 0.6142, + "step": 5557 + }, + { + "epoch": 0.71, + "grad_norm": 1.489784836769104, + "learning_rate": 2.02093058719736e-06, + "loss": 0.5635, + "step": 5558 + }, + { + "epoch": 0.71, + "grad_norm": 1.1324917078018188, + "learning_rate": 2.0192643418865843e-06, + "loss": 0.512, + "step": 5559 + }, + { + "epoch": 0.71, + "grad_norm": 1.250471591949463, + "learning_rate": 2.0175986099469395e-06, + "loss": 0.6027, + "step": 5560 + }, + { + "epoch": 0.71, + "grad_norm": 1.311180830001831, + "learning_rate": 2.0159333916653166e-06, + "loss": 0.5209, + "step": 5561 + }, + { + "epoch": 0.71, + "grad_norm": 1.4973576068878174, + "learning_rate": 2.0142686873285124e-06, + "loss": 0.5737, + "step": 5562 + }, + { + "epoch": 0.71, + "grad_norm": 1.524202823638916, + "learning_rate": 2.012604497223239e-06, + "loss": 0.5775, + "step": 5563 + }, + { + "epoch": 0.71, + "grad_norm": 1.5545145273208618, + "learning_rate": 2.01094082163612e-06, + "loss": 0.6578, + "step": 5564 + }, + { + "epoch": 0.71, + "grad_norm": 1.5071074962615967, + "learning_rate": 2.009277660853691e-06, + "loss": 0.7238, + "step": 5565 + }, + { + "epoch": 0.71, + "grad_norm": 1.6860618591308594, + "learning_rate": 2.007615015162395e-06, + "loss": 0.6873, + "step": 5566 + }, + { + "epoch": 0.71, + "grad_norm": 1.1676408052444458, + "learning_rate": 2.0059528848485895e-06, + "loss": 0.6382, + "step": 5567 + }, + { + "epoch": 0.71, + "grad_norm": 1.1281388998031616, + "learning_rate": 2.0042912701985436e-06, + "loss": 0.6241, + "step": 5568 + }, + { + "epoch": 0.71, + "grad_norm": 1.5055862665176392, + "learning_rate": 2.002630171498438e-06, + "loss": 0.6282, + "step": 5569 + }, + { + "epoch": 0.71, + "grad_norm": 1.5386683940887451, + "learning_rate": 2.0009695890343583e-06, + "loss": 0.6475, + "step": 5570 + }, + { + "epoch": 0.71, + "grad_norm": 1.3176437616348267, + "learning_rate": 1.999309523092311e-06, + "loss": 0.5033, + "step": 5571 + }, + { + "epoch": 0.71, + "grad_norm": 1.1751028299331665, + "learning_rate": 1.997649973958208e-06, + "loss": 0.5784, + "step": 5572 + }, + { + "epoch": 0.71, + "grad_norm": 1.2589142322540283, + "learning_rate": 1.9959909419178713e-06, + "loss": 0.6164, + "step": 5573 + }, + { + "epoch": 0.71, + "grad_norm": 1.3741481304168701, + "learning_rate": 1.9943324272570356e-06, + "loss": 0.5355, + "step": 5574 + }, + { + "epoch": 0.71, + "grad_norm": 1.4822330474853516, + "learning_rate": 1.992674430261349e-06, + "loss": 0.5507, + "step": 5575 + }, + { + "epoch": 0.71, + "grad_norm": 1.3982889652252197, + "learning_rate": 1.9910169512163634e-06, + "loss": 0.6258, + "step": 5576 + }, + { + "epoch": 0.71, + "grad_norm": 1.3766522407531738, + "learning_rate": 1.9893599904075485e-06, + "loss": 0.594, + "step": 5577 + }, + { + "epoch": 0.71, + "grad_norm": 1.1951097249984741, + "learning_rate": 1.987703548120281e-06, + "loss": 0.5951, + "step": 5578 + }, + { + "epoch": 0.71, + "grad_norm": 1.265201449394226, + "learning_rate": 1.9860476246398526e-06, + "loss": 0.6276, + "step": 5579 + }, + { + "epoch": 0.71, + "grad_norm": 2.538569927215576, + "learning_rate": 1.984392220251458e-06, + "loss": 0.6001, + "step": 5580 + }, + { + "epoch": 0.72, + "grad_norm": 1.507003903388977, + "learning_rate": 1.982737335240209e-06, + "loss": 0.6099, + "step": 5581 + }, + { + "epoch": 0.72, + "grad_norm": 1.2398253679275513, + "learning_rate": 1.9810829698911256e-06, + "loss": 0.5759, + "step": 5582 + }, + { + "epoch": 0.72, + "grad_norm": 1.2879706621170044, + "learning_rate": 1.9794291244891406e-06, + "loss": 0.5272, + "step": 5583 + }, + { + "epoch": 0.72, + "grad_norm": 1.5107345581054688, + "learning_rate": 1.977775799319092e-06, + "loss": 0.6362, + "step": 5584 + }, + { + "epoch": 0.72, + "grad_norm": 1.378507137298584, + "learning_rate": 1.976122994665733e-06, + "loss": 0.6266, + "step": 5585 + }, + { + "epoch": 0.72, + "grad_norm": 1.3231158256530762, + "learning_rate": 1.9744707108137257e-06, + "loss": 0.6275, + "step": 5586 + }, + { + "epoch": 0.72, + "grad_norm": 1.7076232433319092, + "learning_rate": 1.9728189480476436e-06, + "loss": 0.6385, + "step": 5587 + }, + { + "epoch": 0.72, + "grad_norm": 1.4472925662994385, + "learning_rate": 1.9711677066519645e-06, + "loss": 0.6796, + "step": 5588 + }, + { + "epoch": 0.72, + "grad_norm": 1.3523715734481812, + "learning_rate": 1.9695169869110864e-06, + "loss": 0.6968, + "step": 5589 + }, + { + "epoch": 0.72, + "grad_norm": 1.466336727142334, + "learning_rate": 1.967866789109313e-06, + "loss": 0.5789, + "step": 5590 + }, + { + "epoch": 0.72, + "grad_norm": 1.324497103691101, + "learning_rate": 1.9662171135308517e-06, + "loss": 0.5396, + "step": 5591 + }, + { + "epoch": 0.72, + "grad_norm": 1.0975810289382935, + "learning_rate": 1.9645679604598297e-06, + "loss": 0.5384, + "step": 5592 + }, + { + "epoch": 0.72, + "grad_norm": 1.4148670434951782, + "learning_rate": 1.9629193301802786e-06, + "loss": 0.5924, + "step": 5593 + }, + { + "epoch": 0.72, + "grad_norm": 1.2906337976455688, + "learning_rate": 1.9612712229761434e-06, + "loss": 0.6046, + "step": 5594 + }, + { + "epoch": 0.72, + "grad_norm": 1.1885030269622803, + "learning_rate": 1.959623639131274e-06, + "loss": 0.5818, + "step": 5595 + }, + { + "epoch": 0.72, + "grad_norm": 1.5290043354034424, + "learning_rate": 1.9579765789294354e-06, + "loss": 0.6608, + "step": 5596 + }, + { + "epoch": 0.72, + "grad_norm": 1.3768508434295654, + "learning_rate": 1.9563300426542992e-06, + "loss": 0.5653, + "step": 5597 + }, + { + "epoch": 0.72, + "grad_norm": 1.3716613054275513, + "learning_rate": 1.95468403058945e-06, + "loss": 0.666, + "step": 5598 + }, + { + "epoch": 0.72, + "grad_norm": 1.6025148630142212, + "learning_rate": 1.9530385430183763e-06, + "loss": 0.6309, + "step": 5599 + }, + { + "epoch": 0.72, + "grad_norm": 1.2713145017623901, + "learning_rate": 1.9513935802244816e-06, + "loss": 0.6173, + "step": 5600 + }, + { + "epoch": 0.72, + "grad_norm": 1.2051814794540405, + "learning_rate": 1.9497491424910774e-06, + "loss": 0.5658, + "step": 5601 + }, + { + "epoch": 0.72, + "grad_norm": 1.1207643747329712, + "learning_rate": 1.948105230101387e-06, + "loss": 0.5222, + "step": 5602 + }, + { + "epoch": 0.72, + "grad_norm": 1.8980365991592407, + "learning_rate": 1.9464618433385367e-06, + "loss": 0.5671, + "step": 5603 + }, + { + "epoch": 0.72, + "grad_norm": 1.4103658199310303, + "learning_rate": 1.9448189824855684e-06, + "loss": 0.6214, + "step": 5604 + }, + { + "epoch": 0.72, + "grad_norm": 1.3645200729370117, + "learning_rate": 1.9431766478254315e-06, + "loss": 0.5669, + "step": 5605 + }, + { + "epoch": 0.72, + "grad_norm": 1.2241871356964111, + "learning_rate": 1.941534839640985e-06, + "loss": 0.5364, + "step": 5606 + }, + { + "epoch": 0.72, + "grad_norm": 1.241102933883667, + "learning_rate": 1.9398935582149964e-06, + "loss": 0.6437, + "step": 5607 + }, + { + "epoch": 0.72, + "grad_norm": 2.057539939880371, + "learning_rate": 1.9382528038301437e-06, + "loss": 0.622, + "step": 5608 + }, + { + "epoch": 0.72, + "grad_norm": 1.2099688053131104, + "learning_rate": 1.9366125767690145e-06, + "loss": 0.5541, + "step": 5609 + }, + { + "epoch": 0.72, + "grad_norm": 1.6069209575653076, + "learning_rate": 1.9349728773141017e-06, + "loss": 0.5802, + "step": 5610 + }, + { + "epoch": 0.72, + "grad_norm": 1.1698077917099, + "learning_rate": 1.9333337057478123e-06, + "loss": 0.5953, + "step": 5611 + }, + { + "epoch": 0.72, + "grad_norm": 1.2990256547927856, + "learning_rate": 1.9316950623524595e-06, + "loss": 0.6457, + "step": 5612 + }, + { + "epoch": 0.72, + "grad_norm": 1.0718307495117188, + "learning_rate": 1.930056947410268e-06, + "loss": 0.5546, + "step": 5613 + }, + { + "epoch": 0.72, + "grad_norm": 1.0749974250793457, + "learning_rate": 1.928419361203367e-06, + "loss": 0.6744, + "step": 5614 + }, + { + "epoch": 0.72, + "grad_norm": 1.2324750423431396, + "learning_rate": 1.9267823040137983e-06, + "loss": 0.6432, + "step": 5615 + }, + { + "epoch": 0.72, + "grad_norm": 1.1767945289611816, + "learning_rate": 1.9251457761235127e-06, + "loss": 0.6066, + "step": 5616 + }, + { + "epoch": 0.72, + "grad_norm": 1.249362826347351, + "learning_rate": 1.92350977781437e-06, + "loss": 0.5928, + "step": 5617 + }, + { + "epoch": 0.72, + "grad_norm": 1.3877533674240112, + "learning_rate": 1.9218743093681346e-06, + "loss": 0.5676, + "step": 5618 + }, + { + "epoch": 0.72, + "grad_norm": 1.1825809478759766, + "learning_rate": 1.920239371066484e-06, + "loss": 0.6075, + "step": 5619 + }, + { + "epoch": 0.72, + "grad_norm": 1.3485913276672363, + "learning_rate": 1.9186049631910047e-06, + "loss": 0.616, + "step": 5620 + }, + { + "epoch": 0.72, + "grad_norm": 1.5328774452209473, + "learning_rate": 1.916971086023188e-06, + "loss": 0.6823, + "step": 5621 + }, + { + "epoch": 0.72, + "grad_norm": 1.6213551759719849, + "learning_rate": 1.915337739844434e-06, + "loss": 0.5982, + "step": 5622 + }, + { + "epoch": 0.72, + "grad_norm": 1.1525039672851562, + "learning_rate": 1.9137049249360583e-06, + "loss": 0.568, + "step": 5623 + }, + { + "epoch": 0.72, + "grad_norm": 1.1615984439849854, + "learning_rate": 1.9120726415792795e-06, + "loss": 0.6214, + "step": 5624 + }, + { + "epoch": 0.72, + "grad_norm": 1.1085628271102905, + "learning_rate": 1.9104408900552226e-06, + "loss": 0.6147, + "step": 5625 + }, + { + "epoch": 0.72, + "grad_norm": 1.7933579683303833, + "learning_rate": 1.9088096706449245e-06, + "loss": 0.6051, + "step": 5626 + }, + { + "epoch": 0.72, + "grad_norm": 1.3298068046569824, + "learning_rate": 1.9071789836293313e-06, + "loss": 0.5936, + "step": 5627 + }, + { + "epoch": 0.72, + "grad_norm": 1.3054797649383545, + "learning_rate": 1.9055488292892927e-06, + "loss": 0.6075, + "step": 5628 + }, + { + "epoch": 0.72, + "grad_norm": 1.7392698526382446, + "learning_rate": 1.9039192079055712e-06, + "loss": 0.6097, + "step": 5629 + }, + { + "epoch": 0.72, + "grad_norm": 1.3165106773376465, + "learning_rate": 1.902290119758836e-06, + "loss": 0.526, + "step": 5630 + }, + { + "epoch": 0.72, + "grad_norm": 1.294399380683899, + "learning_rate": 1.9006615651296662e-06, + "loss": 0.4764, + "step": 5631 + }, + { + "epoch": 0.72, + "grad_norm": 1.2079136371612549, + "learning_rate": 1.8990335442985436e-06, + "loss": 0.6266, + "step": 5632 + }, + { + "epoch": 0.72, + "grad_norm": 1.8871737718582153, + "learning_rate": 1.897406057545863e-06, + "loss": 0.6647, + "step": 5633 + }, + { + "epoch": 0.72, + "grad_norm": 1.2210822105407715, + "learning_rate": 1.895779105151927e-06, + "loss": 0.5954, + "step": 5634 + }, + { + "epoch": 0.72, + "grad_norm": 1.3488540649414062, + "learning_rate": 1.8941526873969463e-06, + "loss": 0.5809, + "step": 5635 + }, + { + "epoch": 0.72, + "grad_norm": 1.655834674835205, + "learning_rate": 1.8925268045610345e-06, + "loss": 0.5514, + "step": 5636 + }, + { + "epoch": 0.72, + "grad_norm": 1.7508749961853027, + "learning_rate": 1.890901456924219e-06, + "loss": 0.5246, + "step": 5637 + }, + { + "epoch": 0.72, + "grad_norm": 1.218916416168213, + "learning_rate": 1.8892766447664323e-06, + "loss": 0.5255, + "step": 5638 + }, + { + "epoch": 0.72, + "grad_norm": 1.2823469638824463, + "learning_rate": 1.8876523683675163e-06, + "loss": 0.6096, + "step": 5639 + }, + { + "epoch": 0.72, + "grad_norm": 1.282475233078003, + "learning_rate": 1.8860286280072187e-06, + "loss": 0.5822, + "step": 5640 + }, + { + "epoch": 0.72, + "grad_norm": 1.4252816438674927, + "learning_rate": 1.884405423965196e-06, + "loss": 0.5581, + "step": 5641 + }, + { + "epoch": 0.72, + "grad_norm": 1.3148558139801025, + "learning_rate": 1.8827827565210143e-06, + "loss": 0.5631, + "step": 5642 + }, + { + "epoch": 0.72, + "grad_norm": 1.354689359664917, + "learning_rate": 1.881160625954141e-06, + "loss": 0.5763, + "step": 5643 + }, + { + "epoch": 0.72, + "grad_norm": 1.8449751138687134, + "learning_rate": 1.8795390325439572e-06, + "loss": 0.6018, + "step": 5644 + }, + { + "epoch": 0.72, + "grad_norm": 1.3221594095230103, + "learning_rate": 1.8779179765697491e-06, + "loss": 0.6552, + "step": 5645 + }, + { + "epoch": 0.72, + "grad_norm": 1.4569640159606934, + "learning_rate": 1.8762974583107129e-06, + "loss": 0.5821, + "step": 5646 + }, + { + "epoch": 0.72, + "grad_norm": 1.3788806200027466, + "learning_rate": 1.8746774780459465e-06, + "loss": 0.5294, + "step": 5647 + }, + { + "epoch": 0.72, + "grad_norm": 1.5517175197601318, + "learning_rate": 1.8730580360544593e-06, + "loss": 0.6109, + "step": 5648 + }, + { + "epoch": 0.72, + "grad_norm": 1.6753023862838745, + "learning_rate": 1.8714391326151681e-06, + "loss": 0.5427, + "step": 5649 + }, + { + "epoch": 0.72, + "grad_norm": 1.5441654920578003, + "learning_rate": 1.8698207680068974e-06, + "loss": 0.6389, + "step": 5650 + }, + { + "epoch": 0.72, + "grad_norm": 1.1922584772109985, + "learning_rate": 1.8682029425083748e-06, + "loss": 0.5781, + "step": 5651 + }, + { + "epoch": 0.72, + "grad_norm": 1.3633989095687866, + "learning_rate": 1.8665856563982392e-06, + "loss": 0.5629, + "step": 5652 + }, + { + "epoch": 0.72, + "grad_norm": 1.3253165483474731, + "learning_rate": 1.864968909955035e-06, + "loss": 0.554, + "step": 5653 + }, + { + "epoch": 0.72, + "grad_norm": 1.3885520696640015, + "learning_rate": 1.8633527034572164e-06, + "loss": 0.6109, + "step": 5654 + }, + { + "epoch": 0.72, + "grad_norm": 1.899064302444458, + "learning_rate": 1.8617370371831373e-06, + "loss": 0.6412, + "step": 5655 + }, + { + "epoch": 0.72, + "grad_norm": 1.9599201679229736, + "learning_rate": 1.8601219114110646e-06, + "loss": 0.5653, + "step": 5656 + }, + { + "epoch": 0.72, + "grad_norm": 1.2851388454437256, + "learning_rate": 1.858507326419176e-06, + "loss": 0.5802, + "step": 5657 + }, + { + "epoch": 0.72, + "grad_norm": 1.6403940916061401, + "learning_rate": 1.8568932824855457e-06, + "loss": 0.5969, + "step": 5658 + }, + { + "epoch": 0.73, + "grad_norm": 1.4266154766082764, + "learning_rate": 1.8552797798881611e-06, + "loss": 0.554, + "step": 5659 + }, + { + "epoch": 0.73, + "grad_norm": 1.2188271284103394, + "learning_rate": 1.8536668189049156e-06, + "loss": 0.5139, + "step": 5660 + }, + { + "epoch": 0.73, + "grad_norm": 1.4466283321380615, + "learning_rate": 1.8520543998136104e-06, + "loss": 0.6892, + "step": 5661 + }, + { + "epoch": 0.73, + "grad_norm": 1.507656455039978, + "learning_rate": 1.8504425228919492e-06, + "loss": 0.6445, + "step": 5662 + }, + { + "epoch": 0.73, + "grad_norm": 1.2851141691207886, + "learning_rate": 1.8488311884175458e-06, + "loss": 0.69, + "step": 5663 + }, + { + "epoch": 0.73, + "grad_norm": 1.239793062210083, + "learning_rate": 1.8472203966679208e-06, + "loss": 0.65, + "step": 5664 + }, + { + "epoch": 0.73, + "grad_norm": 1.2325901985168457, + "learning_rate": 1.8456101479205012e-06, + "loss": 0.5284, + "step": 5665 + }, + { + "epoch": 0.73, + "grad_norm": 1.43928861618042, + "learning_rate": 1.8440004424526165e-06, + "loss": 0.6087, + "step": 5666 + }, + { + "epoch": 0.73, + "grad_norm": 1.4355437755584717, + "learning_rate": 1.8423912805415078e-06, + "loss": 0.6277, + "step": 5667 + }, + { + "epoch": 0.73, + "grad_norm": 1.8221622705459595, + "learning_rate": 1.84078266246432e-06, + "loss": 0.5687, + "step": 5668 + }, + { + "epoch": 0.73, + "grad_norm": 1.6198118925094604, + "learning_rate": 1.839174588498107e-06, + "loss": 0.5901, + "step": 5669 + }, + { + "epoch": 0.73, + "grad_norm": 1.7814929485321045, + "learning_rate": 1.837567058919823e-06, + "loss": 0.5777, + "step": 5670 + }, + { + "epoch": 0.73, + "grad_norm": 1.416741967201233, + "learning_rate": 1.8359600740063344e-06, + "loss": 0.5138, + "step": 5671 + }, + { + "epoch": 0.73, + "grad_norm": 1.3309919834136963, + "learning_rate": 1.8343536340344136e-06, + "loss": 0.5628, + "step": 5672 + }, + { + "epoch": 0.73, + "grad_norm": 1.2797962427139282, + "learning_rate": 1.8327477392807314e-06, + "loss": 0.5647, + "step": 5673 + }, + { + "epoch": 0.73, + "grad_norm": 1.2392196655273438, + "learning_rate": 1.831142390021876e-06, + "loss": 0.5267, + "step": 5674 + }, + { + "epoch": 0.73, + "grad_norm": 1.2258974313735962, + "learning_rate": 1.8295375865343363e-06, + "loss": 0.631, + "step": 5675 + }, + { + "epoch": 0.73, + "grad_norm": 1.6016523838043213, + "learning_rate": 1.8279333290945035e-06, + "loss": 0.6077, + "step": 5676 + }, + { + "epoch": 0.73, + "grad_norm": 1.2385517358779907, + "learning_rate": 1.8263296179786798e-06, + "loss": 0.6142, + "step": 5677 + }, + { + "epoch": 0.73, + "grad_norm": 1.3300591707229614, + "learning_rate": 1.8247264534630727e-06, + "loss": 0.5558, + "step": 5678 + }, + { + "epoch": 0.73, + "grad_norm": 1.3536458015441895, + "learning_rate": 1.8231238358237952e-06, + "loss": 0.5652, + "step": 5679 + }, + { + "epoch": 0.73, + "grad_norm": 1.294861078262329, + "learning_rate": 1.8215217653368632e-06, + "loss": 0.6216, + "step": 5680 + }, + { + "epoch": 0.73, + "grad_norm": 1.1894181966781616, + "learning_rate": 1.8199202422782026e-06, + "loss": 0.5569, + "step": 5681 + }, + { + "epoch": 0.73, + "grad_norm": 1.3334403038024902, + "learning_rate": 1.8183192669236422e-06, + "loss": 0.6186, + "step": 5682 + }, + { + "epoch": 0.73, + "grad_norm": 1.4797943830490112, + "learning_rate": 1.8167188395489194e-06, + "loss": 0.5627, + "step": 5683 + }, + { + "epoch": 0.73, + "grad_norm": 1.333863377571106, + "learning_rate": 1.8151189604296727e-06, + "loss": 0.6105, + "step": 5684 + }, + { + "epoch": 0.73, + "grad_norm": 1.8466845750808716, + "learning_rate": 1.8135196298414498e-06, + "loss": 0.7075, + "step": 5685 + }, + { + "epoch": 0.73, + "grad_norm": 1.2560466527938843, + "learning_rate": 1.8119208480597033e-06, + "loss": 0.6165, + "step": 5686 + }, + { + "epoch": 0.73, + "grad_norm": 1.3292875289916992, + "learning_rate": 1.8103226153597919e-06, + "loss": 0.6225, + "step": 5687 + }, + { + "epoch": 0.73, + "grad_norm": 2.3358840942382812, + "learning_rate": 1.8087249320169758e-06, + "loss": 0.6418, + "step": 5688 + }, + { + "epoch": 0.73, + "grad_norm": 1.621479868888855, + "learning_rate": 1.807127798306425e-06, + "loss": 0.535, + "step": 5689 + }, + { + "epoch": 0.73, + "grad_norm": 1.5514147281646729, + "learning_rate": 1.8055312145032139e-06, + "loss": 0.6259, + "step": 5690 + }, + { + "epoch": 0.73, + "grad_norm": 1.329399585723877, + "learning_rate": 1.8039351808823203e-06, + "loss": 0.56, + "step": 5691 + }, + { + "epoch": 0.73, + "grad_norm": 1.4586081504821777, + "learning_rate": 1.8023396977186297e-06, + "loss": 0.5567, + "step": 5692 + }, + { + "epoch": 0.73, + "grad_norm": 1.481757402420044, + "learning_rate": 1.8007447652869314e-06, + "loss": 0.6491, + "step": 5693 + }, + { + "epoch": 0.73, + "grad_norm": 1.280112624168396, + "learning_rate": 1.7991503838619219e-06, + "loss": 0.6109, + "step": 5694 + }, + { + "epoch": 0.73, + "grad_norm": 1.3548251390457153, + "learning_rate": 1.7975565537181971e-06, + "loss": 0.5796, + "step": 5695 + }, + { + "epoch": 0.73, + "grad_norm": 1.166089415550232, + "learning_rate": 1.7959632751302636e-06, + "loss": 0.5353, + "step": 5696 + }, + { + "epoch": 0.73, + "grad_norm": 1.4079132080078125, + "learning_rate": 1.794370548372532e-06, + "loss": 0.5729, + "step": 5697 + }, + { + "epoch": 0.73, + "grad_norm": 6.476421356201172, + "learning_rate": 1.7927783737193182e-06, + "loss": 0.6065, + "step": 5698 + }, + { + "epoch": 0.73, + "grad_norm": 1.1923269033432007, + "learning_rate": 1.7911867514448384e-06, + "loss": 0.614, + "step": 5699 + }, + { + "epoch": 0.73, + "grad_norm": 1.501766324043274, + "learning_rate": 1.7895956818232191e-06, + "loss": 0.6037, + "step": 5700 + }, + { + "epoch": 0.73, + "grad_norm": 1.4375709295272827, + "learning_rate": 1.7880051651284897e-06, + "loss": 0.6155, + "step": 5701 + }, + { + "epoch": 0.73, + "grad_norm": 1.1795188188552856, + "learning_rate": 1.786415201634586e-06, + "loss": 0.606, + "step": 5702 + }, + { + "epoch": 0.73, + "grad_norm": 1.210554599761963, + "learning_rate": 1.784825791615344e-06, + "loss": 0.5175, + "step": 5703 + }, + { + "epoch": 0.73, + "grad_norm": 1.2811566591262817, + "learning_rate": 1.7832369353445078e-06, + "loss": 0.5747, + "step": 5704 + }, + { + "epoch": 0.73, + "grad_norm": 1.2850680351257324, + "learning_rate": 1.7816486330957272e-06, + "loss": 0.6138, + "step": 5705 + }, + { + "epoch": 0.73, + "grad_norm": 1.2182974815368652, + "learning_rate": 1.780060885142555e-06, + "loss": 0.5599, + "step": 5706 + }, + { + "epoch": 0.73, + "grad_norm": 1.3909426927566528, + "learning_rate": 1.778473691758445e-06, + "loss": 0.5889, + "step": 5707 + }, + { + "epoch": 0.73, + "grad_norm": 1.6075366735458374, + "learning_rate": 1.7768870532167625e-06, + "loss": 0.5588, + "step": 5708 + }, + { + "epoch": 0.73, + "grad_norm": 1.3977075815200806, + "learning_rate": 1.7753009697907753e-06, + "loss": 0.5676, + "step": 5709 + }, + { + "epoch": 0.73, + "grad_norm": 1.174627661705017, + "learning_rate": 1.7737154417536495e-06, + "loss": 0.5714, + "step": 5710 + }, + { + "epoch": 0.73, + "grad_norm": 1.6572093963623047, + "learning_rate": 1.7721304693784624e-06, + "loss": 0.6938, + "step": 5711 + }, + { + "epoch": 0.73, + "grad_norm": 1.2030808925628662, + "learning_rate": 1.770546052938193e-06, + "loss": 0.5375, + "step": 5712 + }, + { + "epoch": 0.73, + "grad_norm": 1.3561108112335205, + "learning_rate": 1.7689621927057265e-06, + "loss": 0.5343, + "step": 5713 + }, + { + "epoch": 0.73, + "grad_norm": 1.1456682682037354, + "learning_rate": 1.7673788889538473e-06, + "loss": 0.5621, + "step": 5714 + }, + { + "epoch": 0.73, + "grad_norm": 1.3912715911865234, + "learning_rate": 1.7657961419552488e-06, + "loss": 0.5994, + "step": 5715 + }, + { + "epoch": 0.73, + "grad_norm": 1.5939741134643555, + "learning_rate": 1.7642139519825274e-06, + "loss": 0.5317, + "step": 5716 + }, + { + "epoch": 0.73, + "grad_norm": 1.3256257772445679, + "learning_rate": 1.7626323193081852e-06, + "loss": 0.5304, + "step": 5717 + }, + { + "epoch": 0.73, + "grad_norm": 1.559259057044983, + "learning_rate": 1.761051244204622e-06, + "loss": 0.6115, + "step": 5718 + }, + { + "epoch": 0.73, + "grad_norm": 1.6052910089492798, + "learning_rate": 1.7594707269441473e-06, + "loss": 0.5905, + "step": 5719 + }, + { + "epoch": 0.73, + "grad_norm": 1.3079288005828857, + "learning_rate": 1.757890767798976e-06, + "loss": 0.6405, + "step": 5720 + }, + { + "epoch": 0.73, + "grad_norm": 1.5783737897872925, + "learning_rate": 1.7563113670412197e-06, + "loss": 0.6213, + "step": 5721 + }, + { + "epoch": 0.73, + "grad_norm": 1.4406654834747314, + "learning_rate": 1.7547325249428999e-06, + "loss": 0.6082, + "step": 5722 + }, + { + "epoch": 0.73, + "grad_norm": 1.1159170866012573, + "learning_rate": 1.7531542417759401e-06, + "loss": 0.6584, + "step": 5723 + }, + { + "epoch": 0.73, + "grad_norm": 1.4534189701080322, + "learning_rate": 1.7515765178121696e-06, + "loss": 0.5921, + "step": 5724 + }, + { + "epoch": 0.73, + "grad_norm": 1.9966347217559814, + "learning_rate": 1.7499993533233134e-06, + "loss": 0.6042, + "step": 5725 + }, + { + "epoch": 0.73, + "grad_norm": 1.3243257999420166, + "learning_rate": 1.7484227485810119e-06, + "loss": 0.5952, + "step": 5726 + }, + { + "epoch": 0.73, + "grad_norm": 1.409643530845642, + "learning_rate": 1.7468467038568033e-06, + "loss": 0.5421, + "step": 5727 + }, + { + "epoch": 0.73, + "grad_norm": 1.2723798751831055, + "learning_rate": 1.7452712194221243e-06, + "loss": 0.647, + "step": 5728 + }, + { + "epoch": 0.73, + "grad_norm": 1.3207684755325317, + "learning_rate": 1.743696295548324e-06, + "loss": 0.5333, + "step": 5729 + }, + { + "epoch": 0.73, + "grad_norm": 1.3573734760284424, + "learning_rate": 1.7421219325066495e-06, + "loss": 0.5481, + "step": 5730 + }, + { + "epoch": 0.73, + "grad_norm": 1.479612946510315, + "learning_rate": 1.7405481305682547e-06, + "loss": 0.5887, + "step": 5731 + }, + { + "epoch": 0.73, + "grad_norm": 1.5791213512420654, + "learning_rate": 1.7389748900041926e-06, + "loss": 0.5848, + "step": 5732 + }, + { + "epoch": 0.73, + "grad_norm": 1.5573316812515259, + "learning_rate": 1.7374022110854222e-06, + "loss": 0.6232, + "step": 5733 + }, + { + "epoch": 0.73, + "grad_norm": 1.2642827033996582, + "learning_rate": 1.7358300940828067e-06, + "loss": 0.5412, + "step": 5734 + }, + { + "epoch": 0.73, + "grad_norm": 1.6646677255630493, + "learning_rate": 1.7342585392671117e-06, + "loss": 0.5998, + "step": 5735 + }, + { + "epoch": 0.73, + "grad_norm": 1.2101836204528809, + "learning_rate": 1.7326875469090037e-06, + "loss": 0.582, + "step": 5736 + }, + { + "epoch": 0.73, + "grad_norm": 1.327369213104248, + "learning_rate": 1.7311171172790548e-06, + "loss": 0.5807, + "step": 5737 + }, + { + "epoch": 0.74, + "grad_norm": 1.1370022296905518, + "learning_rate": 1.72954725064774e-06, + "loss": 0.503, + "step": 5738 + }, + { + "epoch": 0.74, + "grad_norm": 1.4957923889160156, + "learning_rate": 1.7279779472854386e-06, + "loss": 0.5385, + "step": 5739 + }, + { + "epoch": 0.74, + "grad_norm": 1.2059375047683716, + "learning_rate": 1.7264092074624278e-06, + "loss": 0.5874, + "step": 5740 + }, + { + "epoch": 0.74, + "grad_norm": 1.1997199058532715, + "learning_rate": 1.7248410314488928e-06, + "loss": 0.5358, + "step": 5741 + }, + { + "epoch": 0.74, + "grad_norm": 1.4809540510177612, + "learning_rate": 1.7232734195149197e-06, + "loss": 0.5952, + "step": 5742 + }, + { + "epoch": 0.74, + "grad_norm": 1.9461658000946045, + "learning_rate": 1.7217063719304988e-06, + "loss": 0.6296, + "step": 5743 + }, + { + "epoch": 0.74, + "grad_norm": 1.942325472831726, + "learning_rate": 1.7201398889655207e-06, + "loss": 0.6345, + "step": 5744 + }, + { + "epoch": 0.74, + "grad_norm": 1.3077179193496704, + "learning_rate": 1.718573970889782e-06, + "loss": 0.7035, + "step": 5745 + }, + { + "epoch": 0.74, + "grad_norm": 1.5481005907058716, + "learning_rate": 1.71700861797298e-06, + "loss": 0.5672, + "step": 5746 + }, + { + "epoch": 0.74, + "grad_norm": 1.3350780010223389, + "learning_rate": 1.7154438304847132e-06, + "loss": 0.5974, + "step": 5747 + }, + { + "epoch": 0.74, + "grad_norm": 1.4484832286834717, + "learning_rate": 1.7138796086944854e-06, + "loss": 0.6045, + "step": 5748 + }, + { + "epoch": 0.74, + "grad_norm": 1.4450817108154297, + "learning_rate": 1.712315952871702e-06, + "loss": 0.5568, + "step": 5749 + }, + { + "epoch": 0.74, + "grad_norm": 1.2990732192993164, + "learning_rate": 1.7107528632856729e-06, + "loss": 0.541, + "step": 5750 + }, + { + "epoch": 0.74, + "grad_norm": 1.9606150388717651, + "learning_rate": 1.7091903402056054e-06, + "loss": 0.584, + "step": 5751 + }, + { + "epoch": 0.74, + "grad_norm": 1.5782921314239502, + "learning_rate": 1.707628383900613e-06, + "loss": 0.5366, + "step": 5752 + }, + { + "epoch": 0.74, + "grad_norm": 1.3492310047149658, + "learning_rate": 1.7060669946397112e-06, + "loss": 0.6352, + "step": 5753 + }, + { + "epoch": 0.74, + "grad_norm": 1.2007477283477783, + "learning_rate": 1.7045061726918204e-06, + "loss": 0.5359, + "step": 5754 + }, + { + "epoch": 0.74, + "grad_norm": 1.260575771331787, + "learning_rate": 1.7029459183257557e-06, + "loss": 0.5743, + "step": 5755 + }, + { + "epoch": 0.74, + "grad_norm": 1.4031670093536377, + "learning_rate": 1.7013862318102415e-06, + "loss": 0.5996, + "step": 5756 + }, + { + "epoch": 0.74, + "grad_norm": 1.4298655986785889, + "learning_rate": 1.6998271134139022e-06, + "loss": 0.627, + "step": 5757 + }, + { + "epoch": 0.74, + "grad_norm": 2.1103055477142334, + "learning_rate": 1.698268563405266e-06, + "loss": 0.5912, + "step": 5758 + }, + { + "epoch": 0.74, + "grad_norm": 1.184146761894226, + "learning_rate": 1.6967105820527558e-06, + "loss": 0.6081, + "step": 5759 + }, + { + "epoch": 0.74, + "grad_norm": 1.3072179555892944, + "learning_rate": 1.6951531696247075e-06, + "loss": 0.6095, + "step": 5760 + }, + { + "epoch": 0.74, + "grad_norm": 1.2722811698913574, + "learning_rate": 1.6935963263893546e-06, + "loss": 0.4975, + "step": 5761 + }, + { + "epoch": 0.74, + "grad_norm": 1.443372368812561, + "learning_rate": 1.6920400526148272e-06, + "loss": 0.6409, + "step": 5762 + }, + { + "epoch": 0.74, + "grad_norm": 1.277919054031372, + "learning_rate": 1.6904843485691635e-06, + "loss": 0.6582, + "step": 5763 + }, + { + "epoch": 0.74, + "grad_norm": 1.1344646215438843, + "learning_rate": 1.6889292145203028e-06, + "loss": 0.5916, + "step": 5764 + }, + { + "epoch": 0.74, + "grad_norm": 1.1989878416061401, + "learning_rate": 1.6873746507360865e-06, + "loss": 0.4848, + "step": 5765 + }, + { + "epoch": 0.74, + "grad_norm": 1.2986321449279785, + "learning_rate": 1.685820657484254e-06, + "loss": 0.6413, + "step": 5766 + }, + { + "epoch": 0.74, + "grad_norm": 1.6236834526062012, + "learning_rate": 1.684267235032449e-06, + "loss": 0.6122, + "step": 5767 + }, + { + "epoch": 0.74, + "grad_norm": 1.3555066585540771, + "learning_rate": 1.68271438364822e-06, + "loss": 0.5595, + "step": 5768 + }, + { + "epoch": 0.74, + "grad_norm": 1.2782316207885742, + "learning_rate": 1.6811621035990106e-06, + "loss": 0.7395, + "step": 5769 + }, + { + "epoch": 0.74, + "grad_norm": 1.4683347940444946, + "learning_rate": 1.6796103951521708e-06, + "loss": 0.5275, + "step": 5770 + }, + { + "epoch": 0.74, + "grad_norm": 1.3068945407867432, + "learning_rate": 1.678059258574951e-06, + "loss": 0.6087, + "step": 5771 + }, + { + "epoch": 0.74, + "grad_norm": 1.8507400751113892, + "learning_rate": 1.676508694134505e-06, + "loss": 0.5883, + "step": 5772 + }, + { + "epoch": 0.74, + "grad_norm": 1.1494866609573364, + "learning_rate": 1.6749587020978814e-06, + "loss": 0.5752, + "step": 5773 + }, + { + "epoch": 0.74, + "grad_norm": 1.5824859142303467, + "learning_rate": 1.673409282732038e-06, + "loss": 0.5977, + "step": 5774 + }, + { + "epoch": 0.74, + "grad_norm": 1.35085129737854, + "learning_rate": 1.67186043630383e-06, + "loss": 0.5602, + "step": 5775 + }, + { + "epoch": 0.74, + "grad_norm": 1.6015706062316895, + "learning_rate": 1.670312163080015e-06, + "loss": 0.598, + "step": 5776 + }, + { + "epoch": 0.74, + "grad_norm": 1.6979427337646484, + "learning_rate": 1.6687644633272516e-06, + "loss": 0.6116, + "step": 5777 + }, + { + "epoch": 0.74, + "grad_norm": 1.282360315322876, + "learning_rate": 1.6672173373120992e-06, + "loss": 0.5501, + "step": 5778 + }, + { + "epoch": 0.74, + "grad_norm": 1.518515944480896, + "learning_rate": 1.6656707853010207e-06, + "loss": 0.564, + "step": 5779 + }, + { + "epoch": 0.74, + "grad_norm": 1.1478134393692017, + "learning_rate": 1.6641248075603756e-06, + "loss": 0.7633, + "step": 5780 + }, + { + "epoch": 0.74, + "grad_norm": 1.7522804737091064, + "learning_rate": 1.6625794043564275e-06, + "loss": 0.517, + "step": 5781 + }, + { + "epoch": 0.74, + "grad_norm": 1.3169957399368286, + "learning_rate": 1.661034575955342e-06, + "loss": 0.6518, + "step": 5782 + }, + { + "epoch": 0.74, + "grad_norm": 3.0078976154327393, + "learning_rate": 1.659490322623185e-06, + "loss": 0.5437, + "step": 5783 + }, + { + "epoch": 0.74, + "grad_norm": 1.5404016971588135, + "learning_rate": 1.6579466446259201e-06, + "loss": 0.6104, + "step": 5784 + }, + { + "epoch": 0.74, + "grad_norm": 1.531265377998352, + "learning_rate": 1.6564035422294156e-06, + "loss": 0.6512, + "step": 5785 + }, + { + "epoch": 0.74, + "grad_norm": 1.444911241531372, + "learning_rate": 1.65486101569944e-06, + "loss": 0.5998, + "step": 5786 + }, + { + "epoch": 0.74, + "grad_norm": 1.3478665351867676, + "learning_rate": 1.653319065301664e-06, + "loss": 0.6803, + "step": 5787 + }, + { + "epoch": 0.74, + "grad_norm": 1.3380131721496582, + "learning_rate": 1.651777691301653e-06, + "loss": 0.6165, + "step": 5788 + }, + { + "epoch": 0.74, + "grad_norm": 1.335648536682129, + "learning_rate": 1.6502368939648794e-06, + "loss": 0.6804, + "step": 5789 + }, + { + "epoch": 0.74, + "grad_norm": 1.409081220626831, + "learning_rate": 1.6486966735567144e-06, + "loss": 0.5307, + "step": 5790 + }, + { + "epoch": 0.74, + "grad_norm": 1.3683109283447266, + "learning_rate": 1.6471570303424318e-06, + "loss": 0.5651, + "step": 5791 + }, + { + "epoch": 0.74, + "grad_norm": 1.4809428453445435, + "learning_rate": 1.6456179645871996e-06, + "loss": 0.5627, + "step": 5792 + }, + { + "epoch": 0.74, + "grad_norm": 1.6425578594207764, + "learning_rate": 1.6440794765560913e-06, + "loss": 0.6125, + "step": 5793 + }, + { + "epoch": 0.74, + "grad_norm": 1.5959701538085938, + "learning_rate": 1.6425415665140842e-06, + "loss": 0.507, + "step": 5794 + }, + { + "epoch": 0.74, + "grad_norm": 1.0912171602249146, + "learning_rate": 1.6410042347260486e-06, + "loss": 0.6124, + "step": 5795 + }, + { + "epoch": 0.74, + "grad_norm": 1.6113317012786865, + "learning_rate": 1.6394674814567585e-06, + "loss": 0.5317, + "step": 5796 + }, + { + "epoch": 0.74, + "grad_norm": 1.3795167207717896, + "learning_rate": 1.6379313069708896e-06, + "loss": 0.665, + "step": 5797 + }, + { + "epoch": 0.74, + "grad_norm": 1.1050100326538086, + "learning_rate": 1.6363957115330187e-06, + "loss": 0.6964, + "step": 5798 + }, + { + "epoch": 0.74, + "grad_norm": 1.3911715745925903, + "learning_rate": 1.6348606954076169e-06, + "loss": 0.6012, + "step": 5799 + }, + { + "epoch": 0.74, + "grad_norm": 1.1959513425827026, + "learning_rate": 1.6333262588590609e-06, + "loss": 0.7361, + "step": 5800 + }, + { + "epoch": 0.74, + "grad_norm": 1.2428171634674072, + "learning_rate": 1.631792402151627e-06, + "loss": 0.62, + "step": 5801 + }, + { + "epoch": 0.74, + "grad_norm": 2.339919328689575, + "learning_rate": 1.6302591255494916e-06, + "loss": 0.6668, + "step": 5802 + }, + { + "epoch": 0.74, + "grad_norm": 1.3819876909255981, + "learning_rate": 1.6287264293167277e-06, + "loss": 0.6304, + "step": 5803 + }, + { + "epoch": 0.74, + "grad_norm": 1.578447937965393, + "learning_rate": 1.627194313717313e-06, + "loss": 0.6149, + "step": 5804 + }, + { + "epoch": 0.74, + "grad_norm": 1.1103376150131226, + "learning_rate": 1.6256627790151224e-06, + "loss": 0.5619, + "step": 5805 + }, + { + "epoch": 0.74, + "grad_norm": 1.9362802505493164, + "learning_rate": 1.6241318254739346e-06, + "loss": 0.5938, + "step": 5806 + }, + { + "epoch": 0.74, + "grad_norm": 1.4348499774932861, + "learning_rate": 1.6226014533574208e-06, + "loss": 0.6009, + "step": 5807 + }, + { + "epoch": 0.74, + "grad_norm": 1.4850698709487915, + "learning_rate": 1.6210716629291584e-06, + "loss": 0.6183, + "step": 5808 + }, + { + "epoch": 0.74, + "grad_norm": 1.44414484500885, + "learning_rate": 1.6195424544526228e-06, + "loss": 0.6009, + "step": 5809 + }, + { + "epoch": 0.74, + "grad_norm": 1.3758606910705566, + "learning_rate": 1.6180138281911889e-06, + "loss": 0.6444, + "step": 5810 + }, + { + "epoch": 0.74, + "grad_norm": 1.1687239408493042, + "learning_rate": 1.6164857844081316e-06, + "loss": 0.4735, + "step": 5811 + }, + { + "epoch": 0.74, + "grad_norm": 1.3271052837371826, + "learning_rate": 1.6149583233666254e-06, + "loss": 0.6004, + "step": 5812 + }, + { + "epoch": 0.74, + "grad_norm": 1.2638598680496216, + "learning_rate": 1.6134314453297467e-06, + "loss": 0.7543, + "step": 5813 + }, + { + "epoch": 0.74, + "grad_norm": 1.4526175260543823, + "learning_rate": 1.611905150560465e-06, + "loss": 0.5601, + "step": 5814 + }, + { + "epoch": 0.74, + "grad_norm": 1.159395694732666, + "learning_rate": 1.6103794393216554e-06, + "loss": 0.5431, + "step": 5815 + }, + { + "epoch": 0.75, + "grad_norm": 1.4904710054397583, + "learning_rate": 1.6088543118760908e-06, + "loss": 0.5712, + "step": 5816 + }, + { + "epoch": 0.75, + "grad_norm": 1.4881311655044556, + "learning_rate": 1.6073297684864453e-06, + "loss": 0.5665, + "step": 5817 + }, + { + "epoch": 0.75, + "grad_norm": 2.3589932918548584, + "learning_rate": 1.6058058094152862e-06, + "loss": 0.6032, + "step": 5818 + }, + { + "epoch": 0.75, + "grad_norm": 1.4021669626235962, + "learning_rate": 1.6042824349250873e-06, + "loss": 0.6179, + "step": 5819 + }, + { + "epoch": 0.75, + "grad_norm": 1.3111644983291626, + "learning_rate": 1.6027596452782202e-06, + "loss": 0.5615, + "step": 5820 + }, + { + "epoch": 0.75, + "grad_norm": 1.3525159358978271, + "learning_rate": 1.6012374407369514e-06, + "loss": 0.5584, + "step": 5821 + }, + { + "epoch": 0.75, + "grad_norm": 1.5664325952529907, + "learning_rate": 1.5997158215634506e-06, + "loss": 0.5334, + "step": 5822 + }, + { + "epoch": 0.75, + "grad_norm": 1.4455913305282593, + "learning_rate": 1.5981947880197862e-06, + "loss": 0.6552, + "step": 5823 + }, + { + "epoch": 0.75, + "grad_norm": 1.6153526306152344, + "learning_rate": 1.596674340367927e-06, + "loss": 0.5421, + "step": 5824 + }, + { + "epoch": 0.75, + "grad_norm": 1.32578706741333, + "learning_rate": 1.5951544788697354e-06, + "loss": 0.621, + "step": 5825 + }, + { + "epoch": 0.75, + "grad_norm": 1.0666056871414185, + "learning_rate": 1.5936352037869796e-06, + "loss": 0.6811, + "step": 5826 + }, + { + "epoch": 0.75, + "grad_norm": 1.09562349319458, + "learning_rate": 1.5921165153813222e-06, + "loss": 0.6983, + "step": 5827 + }, + { + "epoch": 0.75, + "grad_norm": 1.482304573059082, + "learning_rate": 1.5905984139143277e-06, + "loss": 0.5927, + "step": 5828 + }, + { + "epoch": 0.75, + "grad_norm": 1.3417950868606567, + "learning_rate": 1.5890808996474576e-06, + "loss": 0.593, + "step": 5829 + }, + { + "epoch": 0.75, + "grad_norm": 1.6582002639770508, + "learning_rate": 1.5875639728420727e-06, + "loss": 0.5227, + "step": 5830 + }, + { + "epoch": 0.75, + "grad_norm": 1.452853798866272, + "learning_rate": 1.586047633759435e-06, + "loss": 0.6726, + "step": 5831 + }, + { + "epoch": 0.75, + "grad_norm": 1.5506370067596436, + "learning_rate": 1.5845318826606997e-06, + "loss": 0.5796, + "step": 5832 + }, + { + "epoch": 0.75, + "grad_norm": 1.3531577587127686, + "learning_rate": 1.5830167198069256e-06, + "loss": 0.6651, + "step": 5833 + }, + { + "epoch": 0.75, + "grad_norm": 1.3325982093811035, + "learning_rate": 1.581502145459069e-06, + "loss": 0.6107, + "step": 5834 + }, + { + "epoch": 0.75, + "grad_norm": 1.6698253154754639, + "learning_rate": 1.5799881598779853e-06, + "loss": 0.6907, + "step": 5835 + }, + { + "epoch": 0.75, + "grad_norm": 1.4126887321472168, + "learning_rate": 1.5784747633244257e-06, + "loss": 0.6099, + "step": 5836 + }, + { + "epoch": 0.75, + "grad_norm": 1.450058937072754, + "learning_rate": 1.5769619560590426e-06, + "loss": 0.6664, + "step": 5837 + }, + { + "epoch": 0.75, + "grad_norm": 4.739638805389404, + "learning_rate": 1.5754497383423866e-06, + "loss": 0.5682, + "step": 5838 + }, + { + "epoch": 0.75, + "grad_norm": 1.2308710813522339, + "learning_rate": 1.573938110434909e-06, + "loss": 0.568, + "step": 5839 + }, + { + "epoch": 0.75, + "grad_norm": 1.7075868844985962, + "learning_rate": 1.5724270725969521e-06, + "loss": 0.5979, + "step": 5840 + }, + { + "epoch": 0.75, + "grad_norm": 1.3854655027389526, + "learning_rate": 1.570916625088764e-06, + "loss": 0.5529, + "step": 5841 + }, + { + "epoch": 0.75, + "grad_norm": 1.426127314567566, + "learning_rate": 1.5694067681704888e-06, + "loss": 0.5614, + "step": 5842 + }, + { + "epoch": 0.75, + "grad_norm": 1.2803055047988892, + "learning_rate": 1.5678975021021703e-06, + "loss": 0.6027, + "step": 5843 + }, + { + "epoch": 0.75, + "grad_norm": 1.465036392211914, + "learning_rate": 1.5663888271437434e-06, + "loss": 0.6433, + "step": 5844 + }, + { + "epoch": 0.75, + "grad_norm": 1.2361197471618652, + "learning_rate": 1.5648807435550518e-06, + "loss": 0.643, + "step": 5845 + }, + { + "epoch": 0.75, + "grad_norm": 1.3847153186798096, + "learning_rate": 1.5633732515958322e-06, + "loss": 0.6341, + "step": 5846 + }, + { + "epoch": 0.75, + "grad_norm": 1.526636004447937, + "learning_rate": 1.5618663515257166e-06, + "loss": 0.5841, + "step": 5847 + }, + { + "epoch": 0.75, + "grad_norm": 1.0692555904388428, + "learning_rate": 1.5603600436042393e-06, + "loss": 0.6289, + "step": 5848 + }, + { + "epoch": 0.75, + "grad_norm": 1.4534244537353516, + "learning_rate": 1.5588543280908309e-06, + "loss": 0.5165, + "step": 5849 + }, + { + "epoch": 0.75, + "grad_norm": 1.500308632850647, + "learning_rate": 1.5573492052448226e-06, + "loss": 0.6331, + "step": 5850 + }, + { + "epoch": 0.75, + "grad_norm": 1.619732141494751, + "learning_rate": 1.5558446753254374e-06, + "loss": 0.6026, + "step": 5851 + }, + { + "epoch": 0.75, + "grad_norm": 1.5240439176559448, + "learning_rate": 1.5543407385918019e-06, + "loss": 0.5902, + "step": 5852 + }, + { + "epoch": 0.75, + "grad_norm": 1.284821629524231, + "learning_rate": 1.5528373953029386e-06, + "loss": 0.5572, + "step": 5853 + }, + { + "epoch": 0.75, + "grad_norm": 1.4876408576965332, + "learning_rate": 1.5513346457177692e-06, + "loss": 0.55, + "step": 5854 + }, + { + "epoch": 0.75, + "grad_norm": 1.397421956062317, + "learning_rate": 1.5498324900951083e-06, + "loss": 0.5822, + "step": 5855 + }, + { + "epoch": 0.75, + "grad_norm": 1.2312592267990112, + "learning_rate": 1.5483309286936743e-06, + "loss": 0.5442, + "step": 5856 + }, + { + "epoch": 0.75, + "grad_norm": 1.2165515422821045, + "learning_rate": 1.54682996177208e-06, + "loss": 0.6374, + "step": 5857 + }, + { + "epoch": 0.75, + "grad_norm": 1.3601298332214355, + "learning_rate": 1.5453295895888382e-06, + "loss": 0.5892, + "step": 5858 + }, + { + "epoch": 0.75, + "grad_norm": 1.664817214012146, + "learning_rate": 1.5438298124023537e-06, + "loss": 0.5797, + "step": 5859 + }, + { + "epoch": 0.75, + "grad_norm": 2.5244088172912598, + "learning_rate": 1.542330630470935e-06, + "loss": 0.4757, + "step": 5860 + }, + { + "epoch": 0.75, + "grad_norm": 1.2353243827819824, + "learning_rate": 1.5408320440527874e-06, + "loss": 0.5268, + "step": 5861 + }, + { + "epoch": 0.75, + "grad_norm": 1.2477408647537231, + "learning_rate": 1.5393340534060064e-06, + "loss": 0.6622, + "step": 5862 + }, + { + "epoch": 0.75, + "grad_norm": 1.1838089227676392, + "learning_rate": 1.5378366587885956e-06, + "loss": 0.5516, + "step": 5863 + }, + { + "epoch": 0.75, + "grad_norm": 1.4416934251785278, + "learning_rate": 1.5363398604584496e-06, + "loss": 0.5599, + "step": 5864 + }, + { + "epoch": 0.75, + "grad_norm": 1.2726426124572754, + "learning_rate": 1.5348436586733623e-06, + "loss": 0.5677, + "step": 5865 + }, + { + "epoch": 0.75, + "grad_norm": 1.5306282043457031, + "learning_rate": 1.5333480536910217e-06, + "loss": 0.6393, + "step": 5866 + }, + { + "epoch": 0.75, + "grad_norm": 1.202629566192627, + "learning_rate": 1.5318530457690163e-06, + "loss": 0.6512, + "step": 5867 + }, + { + "epoch": 0.75, + "grad_norm": 1.521677851676941, + "learning_rate": 1.5303586351648326e-06, + "loss": 0.6393, + "step": 5868 + }, + { + "epoch": 0.75, + "grad_norm": 1.428450107574463, + "learning_rate": 1.5288648221358488e-06, + "loss": 0.6125, + "step": 5869 + }, + { + "epoch": 0.75, + "grad_norm": 1.4109379053115845, + "learning_rate": 1.5273716069393467e-06, + "loss": 0.5075, + "step": 5870 + }, + { + "epoch": 0.75, + "grad_norm": 1.4165418148040771, + "learning_rate": 1.525878989832501e-06, + "loss": 0.6775, + "step": 5871 + }, + { + "epoch": 0.75, + "grad_norm": 1.4802438020706177, + "learning_rate": 1.5243869710723875e-06, + "loss": 0.6267, + "step": 5872 + }, + { + "epoch": 0.75, + "grad_norm": 1.338913917541504, + "learning_rate": 1.5228955509159715e-06, + "loss": 0.6234, + "step": 5873 + }, + { + "epoch": 0.75, + "grad_norm": 1.6580629348754883, + "learning_rate": 1.5214047296201218e-06, + "loss": 0.5374, + "step": 5874 + }, + { + "epoch": 0.75, + "grad_norm": 1.2051814794540405, + "learning_rate": 1.5199145074416028e-06, + "loss": 0.5343, + "step": 5875 + }, + { + "epoch": 0.75, + "grad_norm": 1.2296867370605469, + "learning_rate": 1.5184248846370763e-06, + "loss": 0.6521, + "step": 5876 + }, + { + "epoch": 0.75, + "grad_norm": 2.1033120155334473, + "learning_rate": 1.5169358614630958e-06, + "loss": 0.66, + "step": 5877 + }, + { + "epoch": 0.75, + "grad_norm": 1.2728708982467651, + "learning_rate": 1.5154474381761175e-06, + "loss": 0.6069, + "step": 5878 + }, + { + "epoch": 0.75, + "grad_norm": 1.303300380706787, + "learning_rate": 1.5139596150324915e-06, + "loss": 0.5893, + "step": 5879 + }, + { + "epoch": 0.75, + "grad_norm": 1.422898530960083, + "learning_rate": 1.5124723922884655e-06, + "loss": 0.5823, + "step": 5880 + }, + { + "epoch": 0.75, + "grad_norm": 1.917449712753296, + "learning_rate": 1.5109857702001834e-06, + "loss": 0.6907, + "step": 5881 + }, + { + "epoch": 0.75, + "grad_norm": 1.2992802858352661, + "learning_rate": 1.5094997490236857e-06, + "loss": 0.6293, + "step": 5882 + }, + { + "epoch": 0.75, + "grad_norm": 1.5122466087341309, + "learning_rate": 1.50801432901491e-06, + "loss": 0.6306, + "step": 5883 + }, + { + "epoch": 0.75, + "grad_norm": 1.4652949571609497, + "learning_rate": 1.5065295104296884e-06, + "loss": 0.5773, + "step": 5884 + }, + { + "epoch": 0.75, + "grad_norm": 1.2318650484085083, + "learning_rate": 1.5050452935237502e-06, + "loss": 0.5216, + "step": 5885 + }, + { + "epoch": 0.75, + "grad_norm": 1.5187735557556152, + "learning_rate": 1.5035616785527235e-06, + "loss": 0.662, + "step": 5886 + }, + { + "epoch": 0.75, + "grad_norm": 1.2845804691314697, + "learning_rate": 1.5020786657721309e-06, + "loss": 0.5735, + "step": 5887 + }, + { + "epoch": 0.75, + "grad_norm": 1.2593872547149658, + "learning_rate": 1.5005962554373887e-06, + "loss": 0.5358, + "step": 5888 + }, + { + "epoch": 0.75, + "grad_norm": 1.33612060546875, + "learning_rate": 1.4991144478038133e-06, + "loss": 0.5702, + "step": 5889 + }, + { + "epoch": 0.75, + "grad_norm": 1.3789081573486328, + "learning_rate": 1.497633243126616e-06, + "loss": 0.5369, + "step": 5890 + }, + { + "epoch": 0.75, + "grad_norm": 4.179154396057129, + "learning_rate": 1.4961526416609062e-06, + "loss": 0.6509, + "step": 5891 + }, + { + "epoch": 0.75, + "grad_norm": 1.2960028648376465, + "learning_rate": 1.4946726436616844e-06, + "loss": 0.4924, + "step": 5892 + }, + { + "epoch": 0.75, + "grad_norm": 1.3278669118881226, + "learning_rate": 1.493193249383851e-06, + "loss": 0.6282, + "step": 5893 + }, + { + "epoch": 0.76, + "grad_norm": 1.4293935298919678, + "learning_rate": 1.4917144590822015e-06, + "loss": 0.6909, + "step": 5894 + }, + { + "epoch": 0.76, + "grad_norm": 1.6118907928466797, + "learning_rate": 1.4902362730114296e-06, + "loss": 0.6674, + "step": 5895 + }, + { + "epoch": 0.76, + "grad_norm": 1.550476312637329, + "learning_rate": 1.4887586914261175e-06, + "loss": 0.5613, + "step": 5896 + }, + { + "epoch": 0.76, + "grad_norm": 1.3202171325683594, + "learning_rate": 1.4872817145807545e-06, + "loss": 0.5946, + "step": 5897 + }, + { + "epoch": 0.76, + "grad_norm": 1.505103588104248, + "learning_rate": 1.4858053427297186e-06, + "loss": 0.5916, + "step": 5898 + }, + { + "epoch": 0.76, + "grad_norm": 1.4744017124176025, + "learning_rate": 1.4843295761272824e-06, + "loss": 0.6262, + "step": 5899 + }, + { + "epoch": 0.76, + "grad_norm": 1.3989981412887573, + "learning_rate": 1.482854415027618e-06, + "loss": 0.6148, + "step": 5900 + }, + { + "epoch": 0.76, + "grad_norm": 1.4221875667572021, + "learning_rate": 1.4813798596847916e-06, + "loss": 0.57, + "step": 5901 + }, + { + "epoch": 0.76, + "grad_norm": 1.3431564569473267, + "learning_rate": 1.479905910352768e-06, + "loss": 0.5682, + "step": 5902 + }, + { + "epoch": 0.76, + "grad_norm": 1.2961328029632568, + "learning_rate": 1.4784325672854005e-06, + "loss": 0.6361, + "step": 5903 + }, + { + "epoch": 0.76, + "grad_norm": 1.5604289770126343, + "learning_rate": 1.4769598307364452e-06, + "loss": 0.6184, + "step": 5904 + }, + { + "epoch": 0.76, + "grad_norm": 1.340232014656067, + "learning_rate": 1.4754877009595509e-06, + "loss": 0.5844, + "step": 5905 + }, + { + "epoch": 0.76, + "grad_norm": 1.2040913105010986, + "learning_rate": 1.4740161782082623e-06, + "loss": 0.6196, + "step": 5906 + }, + { + "epoch": 0.76, + "grad_norm": 1.2919450998306274, + "learning_rate": 1.4725452627360182e-06, + "loss": 0.623, + "step": 5907 + }, + { + "epoch": 0.76, + "grad_norm": 1.9808508157730103, + "learning_rate": 1.471074954796154e-06, + "loss": 0.6147, + "step": 5908 + }, + { + "epoch": 0.76, + "grad_norm": 1.3140724897384644, + "learning_rate": 1.4696052546419005e-06, + "loss": 0.5737, + "step": 5909 + }, + { + "epoch": 0.76, + "grad_norm": 1.031933069229126, + "learning_rate": 1.4681361625263857e-06, + "loss": 0.6697, + "step": 5910 + }, + { + "epoch": 0.76, + "grad_norm": 1.5908561944961548, + "learning_rate": 1.4666676787026273e-06, + "loss": 0.6361, + "step": 5911 + }, + { + "epoch": 0.76, + "grad_norm": 1.4421895742416382, + "learning_rate": 1.4651998034235438e-06, + "loss": 0.6496, + "step": 5912 + }, + { + "epoch": 0.76, + "grad_norm": 2.0249710083007812, + "learning_rate": 1.463732536941947e-06, + "loss": 0.6466, + "step": 5913 + }, + { + "epoch": 0.76, + "grad_norm": 1.8768399953842163, + "learning_rate": 1.4622658795105427e-06, + "loss": 0.693, + "step": 5914 + }, + { + "epoch": 0.76, + "grad_norm": 1.4486182928085327, + "learning_rate": 1.460799831381934e-06, + "loss": 0.5236, + "step": 5915 + }, + { + "epoch": 0.76, + "grad_norm": 1.1291416883468628, + "learning_rate": 1.4593343928086185e-06, + "loss": 0.6452, + "step": 5916 + }, + { + "epoch": 0.76, + "grad_norm": 1.273863434791565, + "learning_rate": 1.4578695640429863e-06, + "loss": 0.5713, + "step": 5917 + }, + { + "epoch": 0.76, + "grad_norm": 1.5281238555908203, + "learning_rate": 1.4564053453373245e-06, + "loss": 0.6862, + "step": 5918 + }, + { + "epoch": 0.76, + "grad_norm": 1.6069231033325195, + "learning_rate": 1.4549417369438162e-06, + "loss": 0.5982, + "step": 5919 + }, + { + "epoch": 0.76, + "grad_norm": 1.2721714973449707, + "learning_rate": 1.4534787391145388e-06, + "loss": 0.5969, + "step": 5920 + }, + { + "epoch": 0.76, + "grad_norm": 1.1203700304031372, + "learning_rate": 1.452016352101462e-06, + "loss": 0.5516, + "step": 5921 + }, + { + "epoch": 0.76, + "grad_norm": 1.6137348413467407, + "learning_rate": 1.4505545761564526e-06, + "loss": 0.6058, + "step": 5922 + }, + { + "epoch": 0.76, + "grad_norm": 1.4966486692428589, + "learning_rate": 1.4490934115312721e-06, + "loss": 0.6258, + "step": 5923 + }, + { + "epoch": 0.76, + "grad_norm": 1.3931201696395874, + "learning_rate": 1.4476328584775784e-06, + "loss": 0.5624, + "step": 5924 + }, + { + "epoch": 0.76, + "grad_norm": 1.4084396362304688, + "learning_rate": 1.4461729172469191e-06, + "loss": 0.52, + "step": 5925 + }, + { + "epoch": 0.76, + "grad_norm": 1.3359655141830444, + "learning_rate": 1.4447135880907397e-06, + "loss": 0.5936, + "step": 5926 + }, + { + "epoch": 0.76, + "grad_norm": 1.4408459663391113, + "learning_rate": 1.4432548712603816e-06, + "loss": 0.5981, + "step": 5927 + }, + { + "epoch": 0.76, + "grad_norm": 1.4925096035003662, + "learning_rate": 1.4417967670070799e-06, + "loss": 0.556, + "step": 5928 + }, + { + "epoch": 0.76, + "grad_norm": 2.2388222217559814, + "learning_rate": 1.44033927558196e-06, + "loss": 0.5848, + "step": 5929 + }, + { + "epoch": 0.76, + "grad_norm": 1.3931220769882202, + "learning_rate": 1.4388823972360455e-06, + "loss": 0.5938, + "step": 5930 + }, + { + "epoch": 0.76, + "grad_norm": 1.208591103553772, + "learning_rate": 1.4374261322202592e-06, + "loss": 0.6013, + "step": 5931 + }, + { + "epoch": 0.76, + "grad_norm": 1.465299129486084, + "learning_rate": 1.435970480785408e-06, + "loss": 0.6509, + "step": 5932 + }, + { + "epoch": 0.76, + "grad_norm": 1.5342435836791992, + "learning_rate": 1.4345154431821995e-06, + "loss": 0.636, + "step": 5933 + }, + { + "epoch": 0.76, + "grad_norm": 1.555898666381836, + "learning_rate": 1.4330610196612348e-06, + "loss": 0.5484, + "step": 5934 + }, + { + "epoch": 0.76, + "grad_norm": 1.423935055732727, + "learning_rate": 1.4316072104730106e-06, + "loss": 0.5434, + "step": 5935 + }, + { + "epoch": 0.76, + "grad_norm": 1.2774029970169067, + "learning_rate": 1.4301540158679123e-06, + "loss": 0.5129, + "step": 5936 + }, + { + "epoch": 0.76, + "grad_norm": 1.258390188217163, + "learning_rate": 1.4287014360962247e-06, + "loss": 0.5213, + "step": 5937 + }, + { + "epoch": 0.76, + "grad_norm": 1.3220731019973755, + "learning_rate": 1.427249471408126e-06, + "loss": 0.5337, + "step": 5938 + }, + { + "epoch": 0.76, + "grad_norm": 1.4194802045822144, + "learning_rate": 1.4257981220536883e-06, + "loss": 0.567, + "step": 5939 + }, + { + "epoch": 0.76, + "grad_norm": 1.372889757156372, + "learning_rate": 1.424347388282874e-06, + "loss": 0.6144, + "step": 5940 + }, + { + "epoch": 0.76, + "grad_norm": 1.7539900541305542, + "learning_rate": 1.4228972703455441e-06, + "loss": 0.6644, + "step": 5941 + }, + { + "epoch": 0.76, + "grad_norm": 1.544344186782837, + "learning_rate": 1.4214477684914524e-06, + "loss": 0.6441, + "step": 5942 + }, + { + "epoch": 0.76, + "grad_norm": 1.3730299472808838, + "learning_rate": 1.4199988829702472e-06, + "loss": 0.5675, + "step": 5943 + }, + { + "epoch": 0.76, + "grad_norm": 1.2083449363708496, + "learning_rate": 1.4185506140314665e-06, + "loss": 0.5938, + "step": 5944 + }, + { + "epoch": 0.76, + "grad_norm": 1.5624059438705444, + "learning_rate": 1.4171029619245468e-06, + "loss": 0.6969, + "step": 5945 + }, + { + "epoch": 0.76, + "grad_norm": 1.3644176721572876, + "learning_rate": 1.4156559268988168e-06, + "loss": 0.5768, + "step": 5946 + }, + { + "epoch": 0.76, + "grad_norm": 1.3280311822891235, + "learning_rate": 1.414209509203499e-06, + "loss": 0.6175, + "step": 5947 + }, + { + "epoch": 0.76, + "grad_norm": 1.4516396522521973, + "learning_rate": 1.4127637090877094e-06, + "loss": 0.5231, + "step": 5948 + }, + { + "epoch": 0.76, + "grad_norm": 1.298318862915039, + "learning_rate": 1.4113185268004576e-06, + "loss": 0.5446, + "step": 5949 + }, + { + "epoch": 0.76, + "grad_norm": 1.2986234426498413, + "learning_rate": 1.4098739625906482e-06, + "loss": 0.5896, + "step": 5950 + }, + { + "epoch": 0.76, + "grad_norm": 1.3965665102005005, + "learning_rate": 1.4084300167070758e-06, + "loss": 0.6212, + "step": 5951 + }, + { + "epoch": 0.76, + "grad_norm": 1.5937730073928833, + "learning_rate": 1.4069866893984307e-06, + "loss": 0.5914, + "step": 5952 + }, + { + "epoch": 0.76, + "grad_norm": 2.7656188011169434, + "learning_rate": 1.4055439809132987e-06, + "loss": 0.5545, + "step": 5953 + }, + { + "epoch": 0.76, + "grad_norm": 1.46426260471344, + "learning_rate": 1.4041018915001564e-06, + "loss": 0.58, + "step": 5954 + }, + { + "epoch": 0.76, + "grad_norm": 1.5463647842407227, + "learning_rate": 1.4026604214073731e-06, + "loss": 0.5886, + "step": 5955 + }, + { + "epoch": 0.76, + "grad_norm": 1.4944566488265991, + "learning_rate": 1.4012195708832132e-06, + "loss": 0.5805, + "step": 5956 + }, + { + "epoch": 0.76, + "grad_norm": 1.733450174331665, + "learning_rate": 1.3997793401758347e-06, + "loss": 0.5651, + "step": 5957 + }, + { + "epoch": 0.76, + "grad_norm": 1.2912808656692505, + "learning_rate": 1.398339729533289e-06, + "loss": 0.6123, + "step": 5958 + }, + { + "epoch": 0.76, + "grad_norm": 1.4565380811691284, + "learning_rate": 1.3969007392035166e-06, + "loss": 0.656, + "step": 5959 + }, + { + "epoch": 0.76, + "grad_norm": 1.2773964405059814, + "learning_rate": 1.3954623694343566e-06, + "loss": 0.5466, + "step": 5960 + }, + { + "epoch": 0.76, + "grad_norm": 1.2596515417099, + "learning_rate": 1.3940246204735398e-06, + "loss": 0.5493, + "step": 5961 + }, + { + "epoch": 0.76, + "grad_norm": 1.3928008079528809, + "learning_rate": 1.392587492568686e-06, + "loss": 0.5857, + "step": 5962 + }, + { + "epoch": 0.76, + "grad_norm": 1.3344323635101318, + "learning_rate": 1.3911509859673138e-06, + "loss": 0.5673, + "step": 5963 + }, + { + "epoch": 0.76, + "grad_norm": 1.650927186012268, + "learning_rate": 1.3897151009168297e-06, + "loss": 0.5837, + "step": 5964 + }, + { + "epoch": 0.76, + "grad_norm": 1.2089368104934692, + "learning_rate": 1.3882798376645413e-06, + "loss": 0.5227, + "step": 5965 + }, + { + "epoch": 0.76, + "grad_norm": 1.2477643489837646, + "learning_rate": 1.386845196457638e-06, + "loss": 0.5825, + "step": 5966 + }, + { + "epoch": 0.76, + "grad_norm": 1.5391136407852173, + "learning_rate": 1.3854111775432093e-06, + "loss": 0.6396, + "step": 5967 + }, + { + "epoch": 0.76, + "grad_norm": 1.506758213043213, + "learning_rate": 1.383977781168238e-06, + "loss": 0.5131, + "step": 5968 + }, + { + "epoch": 0.76, + "grad_norm": 1.46280038356781, + "learning_rate": 1.3825450075795937e-06, + "loss": 0.5954, + "step": 5969 + }, + { + "epoch": 0.76, + "grad_norm": 1.0830283164978027, + "learning_rate": 1.3811128570240445e-06, + "loss": 0.6811, + "step": 5970 + }, + { + "epoch": 0.76, + "grad_norm": 1.7907627820968628, + "learning_rate": 1.3796813297482486e-06, + "loss": 0.6008, + "step": 5971 + }, + { + "epoch": 0.77, + "grad_norm": 1.3350781202316284, + "learning_rate": 1.3782504259987601e-06, + "loss": 0.538, + "step": 5972 + }, + { + "epoch": 0.77, + "grad_norm": 1.362600326538086, + "learning_rate": 1.3768201460220187e-06, + "loss": 0.5904, + "step": 5973 + }, + { + "epoch": 0.77, + "grad_norm": 1.149887204170227, + "learning_rate": 1.375390490064364e-06, + "loss": 0.5439, + "step": 5974 + }, + { + "epoch": 0.77, + "grad_norm": 1.1738979816436768, + "learning_rate": 1.373961458372024e-06, + "loss": 0.535, + "step": 5975 + }, + { + "epoch": 0.77, + "grad_norm": 1.3399025201797485, + "learning_rate": 1.3725330511911223e-06, + "loss": 0.5719, + "step": 5976 + }, + { + "epoch": 0.77, + "grad_norm": 1.3920693397521973, + "learning_rate": 1.3711052687676701e-06, + "loss": 0.6096, + "step": 5977 + }, + { + "epoch": 0.77, + "grad_norm": 1.3506783246994019, + "learning_rate": 1.3696781113475754e-06, + "loss": 0.594, + "step": 5978 + }, + { + "epoch": 0.77, + "grad_norm": 1.2706949710845947, + "learning_rate": 1.3682515791766372e-06, + "loss": 0.5891, + "step": 5979 + }, + { + "epoch": 0.77, + "grad_norm": 1.2764166593551636, + "learning_rate": 1.3668256725005475e-06, + "loss": 0.6208, + "step": 5980 + }, + { + "epoch": 0.77, + "grad_norm": 1.4557310342788696, + "learning_rate": 1.3654003915648873e-06, + "loss": 0.558, + "step": 5981 + }, + { + "epoch": 0.77, + "grad_norm": 1.1343717575073242, + "learning_rate": 1.3639757366151323e-06, + "loss": 0.5511, + "step": 5982 + }, + { + "epoch": 0.77, + "grad_norm": 1.4977566003799438, + "learning_rate": 1.3625517078966544e-06, + "loss": 0.5928, + "step": 5983 + }, + { + "epoch": 0.77, + "grad_norm": 1.3123750686645508, + "learning_rate": 1.3611283056547097e-06, + "loss": 0.5015, + "step": 5984 + }, + { + "epoch": 0.77, + "grad_norm": 1.268973469734192, + "learning_rate": 1.3597055301344515e-06, + "loss": 0.6172, + "step": 5985 + }, + { + "epoch": 0.77, + "grad_norm": 1.3133175373077393, + "learning_rate": 1.3582833815809244e-06, + "loss": 0.6634, + "step": 5986 + }, + { + "epoch": 0.77, + "grad_norm": 1.5407078266143799, + "learning_rate": 1.356861860239065e-06, + "loss": 0.6416, + "step": 5987 + }, + { + "epoch": 0.77, + "grad_norm": 1.2365789413452148, + "learning_rate": 1.3554409663536993e-06, + "loss": 0.6078, + "step": 5988 + }, + { + "epoch": 0.77, + "grad_norm": 1.4890007972717285, + "learning_rate": 1.3540207001695489e-06, + "loss": 0.5228, + "step": 5989 + }, + { + "epoch": 0.77, + "grad_norm": 1.1936070919036865, + "learning_rate": 1.3526010619312252e-06, + "loss": 0.6939, + "step": 5990 + }, + { + "epoch": 0.77, + "grad_norm": 1.4938281774520874, + "learning_rate": 1.3511820518832342e-06, + "loss": 0.628, + "step": 5991 + }, + { + "epoch": 0.77, + "grad_norm": 1.7732173204421997, + "learning_rate": 1.3497636702699684e-06, + "loss": 0.6403, + "step": 5992 + }, + { + "epoch": 0.77, + "grad_norm": 1.5048422813415527, + "learning_rate": 1.3483459173357167e-06, + "loss": 0.6572, + "step": 5993 + }, + { + "epoch": 0.77, + "grad_norm": 1.2302240133285522, + "learning_rate": 1.3469287933246577e-06, + "loss": 0.562, + "step": 5994 + }, + { + "epoch": 0.77, + "grad_norm": 1.197856068611145, + "learning_rate": 1.3455122984808644e-06, + "loss": 0.677, + "step": 5995 + }, + { + "epoch": 0.77, + "grad_norm": 1.3732661008834839, + "learning_rate": 1.3440964330482958e-06, + "loss": 0.6358, + "step": 5996 + }, + { + "epoch": 0.77, + "grad_norm": 1.2321995496749878, + "learning_rate": 1.3426811972708076e-06, + "loss": 0.6068, + "step": 5997 + }, + { + "epoch": 0.77, + "grad_norm": 1.1939703226089478, + "learning_rate": 1.3412665913921451e-06, + "loss": 0.5675, + "step": 5998 + }, + { + "epoch": 0.77, + "grad_norm": 1.3941285610198975, + "learning_rate": 1.339852615655946e-06, + "loss": 0.612, + "step": 5999 + }, + { + "epoch": 0.77, + "grad_norm": 1.4940359592437744, + "learning_rate": 1.3384392703057386e-06, + "loss": 0.653, + "step": 6000 + }, + { + "epoch": 0.77, + "grad_norm": 1.482255220413208, + "learning_rate": 1.3370265555849427e-06, + "loss": 0.495, + "step": 6001 + }, + { + "epoch": 0.77, + "grad_norm": 1.4018694162368774, + "learning_rate": 1.335614471736872e-06, + "loss": 0.6908, + "step": 6002 + }, + { + "epoch": 0.77, + "grad_norm": 1.2565648555755615, + "learning_rate": 1.334203019004725e-06, + "loss": 0.5861, + "step": 6003 + }, + { + "epoch": 0.77, + "grad_norm": 1.4649759531021118, + "learning_rate": 1.3327921976315977e-06, + "loss": 0.5378, + "step": 6004 + }, + { + "epoch": 0.77, + "grad_norm": 1.391389012336731, + "learning_rate": 1.3313820078604761e-06, + "loss": 0.6189, + "step": 6005 + }, + { + "epoch": 0.77, + "grad_norm": 1.4627043008804321, + "learning_rate": 1.3299724499342376e-06, + "loss": 0.6309, + "step": 6006 + }, + { + "epoch": 0.77, + "grad_norm": 1.310528039932251, + "learning_rate": 1.3285635240956469e-06, + "loss": 0.6002, + "step": 6007 + }, + { + "epoch": 0.77, + "grad_norm": 1.153878092765808, + "learning_rate": 1.3271552305873648e-06, + "loss": 0.6072, + "step": 6008 + }, + { + "epoch": 0.77, + "grad_norm": 1.3677489757537842, + "learning_rate": 1.3257475696519417e-06, + "loss": 0.6208, + "step": 6009 + }, + { + "epoch": 0.77, + "grad_norm": 1.3258711099624634, + "learning_rate": 1.3243405415318166e-06, + "loss": 0.5862, + "step": 6010 + }, + { + "epoch": 0.77, + "grad_norm": 1.423606038093567, + "learning_rate": 1.322934146469323e-06, + "loss": 0.5903, + "step": 6011 + }, + { + "epoch": 0.77, + "grad_norm": 1.480432152748108, + "learning_rate": 1.3215283847066828e-06, + "loss": 0.4928, + "step": 6012 + }, + { + "epoch": 0.77, + "grad_norm": 1.5109148025512695, + "learning_rate": 1.3201232564860128e-06, + "loss": 0.6378, + "step": 6013 + }, + { + "epoch": 0.77, + "grad_norm": 1.531599998474121, + "learning_rate": 1.3187187620493137e-06, + "loss": 0.6011, + "step": 6014 + }, + { + "epoch": 0.77, + "grad_norm": 1.9274752140045166, + "learning_rate": 1.3173149016384829e-06, + "loss": 0.5542, + "step": 6015 + }, + { + "epoch": 0.77, + "grad_norm": 1.15962553024292, + "learning_rate": 1.3159116754953073e-06, + "loss": 0.5979, + "step": 6016 + }, + { + "epoch": 0.77, + "grad_norm": 1.3300819396972656, + "learning_rate": 1.3145090838614633e-06, + "loss": 0.538, + "step": 6017 + }, + { + "epoch": 0.77, + "grad_norm": 1.3785098791122437, + "learning_rate": 1.3131071269785196e-06, + "loss": 0.6167, + "step": 6018 + }, + { + "epoch": 0.77, + "grad_norm": 1.139225721359253, + "learning_rate": 1.3117058050879344e-06, + "loss": 0.7076, + "step": 6019 + }, + { + "epoch": 0.77, + "grad_norm": 1.5697603225708008, + "learning_rate": 1.3103051184310577e-06, + "loss": 0.614, + "step": 6020 + }, + { + "epoch": 0.77, + "grad_norm": 1.3997340202331543, + "learning_rate": 1.3089050672491278e-06, + "loss": 0.5386, + "step": 6021 + }, + { + "epoch": 0.77, + "grad_norm": 1.4354143142700195, + "learning_rate": 1.3075056517832758e-06, + "loss": 0.6331, + "step": 6022 + }, + { + "epoch": 0.77, + "grad_norm": 1.3406672477722168, + "learning_rate": 1.306106872274522e-06, + "loss": 0.5646, + "step": 6023 + }, + { + "epoch": 0.77, + "grad_norm": 1.3164324760437012, + "learning_rate": 1.30470872896378e-06, + "loss": 0.582, + "step": 6024 + }, + { + "epoch": 0.77, + "grad_norm": 1.4579137563705444, + "learning_rate": 1.3033112220918482e-06, + "loss": 0.6018, + "step": 6025 + }, + { + "epoch": 0.77, + "grad_norm": 1.4018653631210327, + "learning_rate": 1.3019143518994203e-06, + "loss": 0.6285, + "step": 6026 + }, + { + "epoch": 0.77, + "grad_norm": 1.6690030097961426, + "learning_rate": 1.3005181186270794e-06, + "loss": 0.6455, + "step": 6027 + }, + { + "epoch": 0.77, + "grad_norm": 1.20956552028656, + "learning_rate": 1.2991225225152988e-06, + "loss": 0.5629, + "step": 6028 + }, + { + "epoch": 0.77, + "grad_norm": 1.3824257850646973, + "learning_rate": 1.297727563804439e-06, + "loss": 0.5708, + "step": 6029 + }, + { + "epoch": 0.77, + "grad_norm": 1.7339993715286255, + "learning_rate": 1.2963332427347552e-06, + "loss": 0.6303, + "step": 6030 + }, + { + "epoch": 0.77, + "grad_norm": 1.2081109285354614, + "learning_rate": 1.2949395595463904e-06, + "loss": 0.6148, + "step": 6031 + }, + { + "epoch": 0.77, + "grad_norm": 1.271765947341919, + "learning_rate": 1.2935465144793797e-06, + "loss": 0.5637, + "step": 6032 + }, + { + "epoch": 0.77, + "grad_norm": 1.3769280910491943, + "learning_rate": 1.2921541077736428e-06, + "loss": 0.565, + "step": 6033 + }, + { + "epoch": 0.77, + "grad_norm": 1.288026213645935, + "learning_rate": 1.2907623396689973e-06, + "loss": 0.5762, + "step": 6034 + }, + { + "epoch": 0.77, + "grad_norm": 2.817467451095581, + "learning_rate": 1.289371210405147e-06, + "loss": 0.6836, + "step": 6035 + }, + { + "epoch": 0.77, + "grad_norm": 1.3072487115859985, + "learning_rate": 1.2879807202216826e-06, + "loss": 0.5799, + "step": 6036 + }, + { + "epoch": 0.77, + "grad_norm": 1.3482693433761597, + "learning_rate": 1.2865908693580903e-06, + "loss": 0.6028, + "step": 6037 + }, + { + "epoch": 0.77, + "grad_norm": 1.5620173215866089, + "learning_rate": 1.2852016580537424e-06, + "loss": 0.5214, + "step": 6038 + }, + { + "epoch": 0.77, + "grad_norm": 1.521825909614563, + "learning_rate": 1.2838130865479049e-06, + "loss": 0.6226, + "step": 6039 + }, + { + "epoch": 0.77, + "grad_norm": 1.7323329448699951, + "learning_rate": 1.2824251550797268e-06, + "loss": 0.5745, + "step": 6040 + }, + { + "epoch": 0.77, + "grad_norm": 1.4529424905776978, + "learning_rate": 1.281037863888253e-06, + "loss": 0.6783, + "step": 6041 + }, + { + "epoch": 0.77, + "grad_norm": 1.3162099123001099, + "learning_rate": 1.2796512132124172e-06, + "loss": 0.6138, + "step": 6042 + }, + { + "epoch": 0.77, + "grad_norm": 1.3417941331863403, + "learning_rate": 1.278265203291042e-06, + "loss": 0.651, + "step": 6043 + }, + { + "epoch": 0.77, + "grad_norm": 1.2643450498580933, + "learning_rate": 1.2768798343628368e-06, + "loss": 0.5403, + "step": 6044 + }, + { + "epoch": 0.77, + "grad_norm": 1.5022494792938232, + "learning_rate": 1.2754951066664056e-06, + "loss": 0.592, + "step": 6045 + }, + { + "epoch": 0.77, + "grad_norm": 1.4565330743789673, + "learning_rate": 1.274111020440238e-06, + "loss": 0.5484, + "step": 6046 + }, + { + "epoch": 0.77, + "grad_norm": 1.3392274379730225, + "learning_rate": 1.2727275759227175e-06, + "loss": 0.6221, + "step": 6047 + }, + { + "epoch": 0.77, + "grad_norm": 1.425985336303711, + "learning_rate": 1.271344773352111e-06, + "loss": 0.5557, + "step": 6048 + }, + { + "epoch": 0.77, + "grad_norm": 1.3691293001174927, + "learning_rate": 1.2699626129665798e-06, + "loss": 0.6133, + "step": 6049 + }, + { + "epoch": 0.78, + "grad_norm": 1.5187207460403442, + "learning_rate": 1.2685810950041722e-06, + "loss": 0.6545, + "step": 6050 + }, + { + "epoch": 0.78, + "grad_norm": 1.3241124153137207, + "learning_rate": 1.267200219702827e-06, + "loss": 0.5776, + "step": 6051 + }, + { + "epoch": 0.78, + "grad_norm": 1.4314247369766235, + "learning_rate": 1.2658199873003723e-06, + "loss": 0.5092, + "step": 6052 + }, + { + "epoch": 0.78, + "grad_norm": 1.4745780229568481, + "learning_rate": 1.264440398034525e-06, + "loss": 0.561, + "step": 6053 + }, + { + "epoch": 0.78, + "grad_norm": 1.2822953462600708, + "learning_rate": 1.2630614521428919e-06, + "loss": 0.601, + "step": 6054 + }, + { + "epoch": 0.78, + "grad_norm": 1.345860242843628, + "learning_rate": 1.2616831498629668e-06, + "loss": 0.5509, + "step": 6055 + }, + { + "epoch": 0.78, + "grad_norm": 1.3719156980514526, + "learning_rate": 1.2603054914321343e-06, + "loss": 0.5773, + "step": 6056 + }, + { + "epoch": 0.78, + "grad_norm": 1.6491966247558594, + "learning_rate": 1.2589284770876692e-06, + "loss": 0.5813, + "step": 6057 + }, + { + "epoch": 0.78, + "grad_norm": 1.3380208015441895, + "learning_rate": 1.2575521070667352e-06, + "loss": 0.6066, + "step": 6058 + }, + { + "epoch": 0.78, + "grad_norm": 1.6886008977890015, + "learning_rate": 1.256176381606381e-06, + "loss": 0.6094, + "step": 6059 + }, + { + "epoch": 0.78, + "grad_norm": 1.4857572317123413, + "learning_rate": 1.2548013009435495e-06, + "loss": 0.5453, + "step": 6060 + }, + { + "epoch": 0.78, + "grad_norm": 1.2749769687652588, + "learning_rate": 1.2534268653150705e-06, + "loss": 0.5806, + "step": 6061 + }, + { + "epoch": 0.78, + "grad_norm": 1.563644528388977, + "learning_rate": 1.2520530749576615e-06, + "loss": 0.6023, + "step": 6062 + }, + { + "epoch": 0.78, + "grad_norm": 1.2830228805541992, + "learning_rate": 1.2506799301079298e-06, + "loss": 0.5755, + "step": 6063 + }, + { + "epoch": 0.78, + "grad_norm": 1.7365049123764038, + "learning_rate": 1.2493074310023728e-06, + "loss": 0.5773, + "step": 6064 + }, + { + "epoch": 0.78, + "grad_norm": 1.2732226848602295, + "learning_rate": 1.2479355778773761e-06, + "loss": 0.5507, + "step": 6065 + }, + { + "epoch": 0.78, + "grad_norm": 1.5290393829345703, + "learning_rate": 1.2465643709692115e-06, + "loss": 0.6324, + "step": 6066 + }, + { + "epoch": 0.78, + "grad_norm": 2.335749387741089, + "learning_rate": 1.245193810514041e-06, + "loss": 0.6078, + "step": 6067 + }, + { + "epoch": 0.78, + "grad_norm": 1.4709378480911255, + "learning_rate": 1.2438238967479205e-06, + "loss": 0.6015, + "step": 6068 + }, + { + "epoch": 0.78, + "grad_norm": 1.3493144512176514, + "learning_rate": 1.2424546299067846e-06, + "loss": 0.6496, + "step": 6069 + }, + { + "epoch": 0.78, + "grad_norm": 1.2655116319656372, + "learning_rate": 1.2410860102264644e-06, + "loss": 0.5568, + "step": 6070 + }, + { + "epoch": 0.78, + "grad_norm": 1.2244620323181152, + "learning_rate": 1.239718037942676e-06, + "loss": 0.6423, + "step": 6071 + }, + { + "epoch": 0.78, + "grad_norm": 1.434319019317627, + "learning_rate": 1.238350713291026e-06, + "loss": 0.6144, + "step": 6072 + }, + { + "epoch": 0.78, + "grad_norm": 1.324418067932129, + "learning_rate": 1.2369840365070063e-06, + "loss": 0.5737, + "step": 6073 + }, + { + "epoch": 0.78, + "grad_norm": 1.387671709060669, + "learning_rate": 1.235618007826e-06, + "loss": 0.6031, + "step": 6074 + }, + { + "epoch": 0.78, + "grad_norm": 1.4040278196334839, + "learning_rate": 1.2342526274832778e-06, + "loss": 0.6111, + "step": 6075 + }, + { + "epoch": 0.78, + "grad_norm": 1.373987078666687, + "learning_rate": 1.2328878957140006e-06, + "loss": 0.5173, + "step": 6076 + }, + { + "epoch": 0.78, + "grad_norm": 1.1661827564239502, + "learning_rate": 1.2315238127532126e-06, + "loss": 0.5731, + "step": 6077 + }, + { + "epoch": 0.78, + "grad_norm": 1.3571733236312866, + "learning_rate": 1.2301603788358501e-06, + "loss": 0.6028, + "step": 6078 + }, + { + "epoch": 0.78, + "grad_norm": 1.2105902433395386, + "learning_rate": 1.2287975941967377e-06, + "loss": 0.5522, + "step": 6079 + }, + { + "epoch": 0.78, + "grad_norm": 1.1787046194076538, + "learning_rate": 1.2274354590705888e-06, + "loss": 0.5634, + "step": 6080 + }, + { + "epoch": 0.78, + "grad_norm": 1.513954520225525, + "learning_rate": 1.2260739736920002e-06, + "loss": 0.5923, + "step": 6081 + }, + { + "epoch": 0.78, + "grad_norm": 1.3857628107070923, + "learning_rate": 1.224713138295462e-06, + "loss": 0.5761, + "step": 6082 + }, + { + "epoch": 0.78, + "grad_norm": 2.2354791164398193, + "learning_rate": 1.22335295311535e-06, + "loss": 0.5485, + "step": 6083 + }, + { + "epoch": 0.78, + "grad_norm": 1.4245437383651733, + "learning_rate": 1.2219934183859283e-06, + "loss": 0.6352, + "step": 6084 + }, + { + "epoch": 0.78, + "grad_norm": 1.455147385597229, + "learning_rate": 1.22063453434135e-06, + "loss": 0.6363, + "step": 6085 + }, + { + "epoch": 0.78, + "grad_norm": 1.2811365127563477, + "learning_rate": 1.2192763012156544e-06, + "loss": 0.5653, + "step": 6086 + }, + { + "epoch": 0.78, + "grad_norm": 1.1058675050735474, + "learning_rate": 1.2179187192427716e-06, + "loss": 0.708, + "step": 6087 + }, + { + "epoch": 0.78, + "grad_norm": 1.4116437435150146, + "learning_rate": 1.216561788656514e-06, + "loss": 0.6089, + "step": 6088 + }, + { + "epoch": 0.78, + "grad_norm": 1.2147979736328125, + "learning_rate": 1.2152055096905868e-06, + "loss": 0.5706, + "step": 6089 + }, + { + "epoch": 0.78, + "grad_norm": 1.238149881362915, + "learning_rate": 1.2138498825785822e-06, + "loss": 0.5763, + "step": 6090 + }, + { + "epoch": 0.78, + "grad_norm": 1.7066261768341064, + "learning_rate": 1.21249490755398e-06, + "loss": 0.5693, + "step": 6091 + }, + { + "epoch": 0.78, + "grad_norm": 1.3570530414581299, + "learning_rate": 1.2111405848501451e-06, + "loss": 0.6234, + "step": 6092 + }, + { + "epoch": 0.78, + "grad_norm": 1.259835958480835, + "learning_rate": 1.2097869147003328e-06, + "loss": 0.5633, + "step": 6093 + }, + { + "epoch": 0.78, + "grad_norm": 1.6666969060897827, + "learning_rate": 1.2084338973376853e-06, + "loss": 0.575, + "step": 6094 + }, + { + "epoch": 0.78, + "grad_norm": 0.9863353967666626, + "learning_rate": 1.207081532995234e-06, + "loss": 0.6823, + "step": 6095 + }, + { + "epoch": 0.78, + "grad_norm": 1.477820873260498, + "learning_rate": 1.2057298219058933e-06, + "loss": 0.6011, + "step": 6096 + }, + { + "epoch": 0.78, + "grad_norm": 1.3085917234420776, + "learning_rate": 1.204378764302469e-06, + "loss": 0.5428, + "step": 6097 + }, + { + "epoch": 0.78, + "grad_norm": 1.393518090248108, + "learning_rate": 1.2030283604176545e-06, + "loss": 0.6187, + "step": 6098 + }, + { + "epoch": 0.78, + "grad_norm": 1.3972924947738647, + "learning_rate": 1.2016786104840296e-06, + "loss": 0.6148, + "step": 6099 + }, + { + "epoch": 0.78, + "grad_norm": 1.4566419124603271, + "learning_rate": 1.2003295147340587e-06, + "loss": 0.6794, + "step": 6100 + }, + { + "epoch": 0.78, + "grad_norm": 1.3721485137939453, + "learning_rate": 1.1989810734000962e-06, + "loss": 0.5104, + "step": 6101 + }, + { + "epoch": 0.78, + "grad_norm": 1.3800498247146606, + "learning_rate": 1.1976332867143887e-06, + "loss": 0.4953, + "step": 6102 + }, + { + "epoch": 0.78, + "grad_norm": 1.5365797281265259, + "learning_rate": 1.1962861549090593e-06, + "loss": 0.607, + "step": 6103 + }, + { + "epoch": 0.78, + "grad_norm": 1.18382728099823, + "learning_rate": 1.1949396782161272e-06, + "loss": 0.725, + "step": 6104 + }, + { + "epoch": 0.78, + "grad_norm": 1.2551465034484863, + "learning_rate": 1.1935938568674943e-06, + "loss": 0.6576, + "step": 6105 + }, + { + "epoch": 0.78, + "grad_norm": 1.2952631711959839, + "learning_rate": 1.1922486910949527e-06, + "loss": 0.5513, + "step": 6106 + }, + { + "epoch": 0.78, + "grad_norm": 1.188146948814392, + "learning_rate": 1.1909041811301775e-06, + "loss": 0.5895, + "step": 6107 + }, + { + "epoch": 0.78, + "grad_norm": 1.3294532299041748, + "learning_rate": 1.189560327204734e-06, + "loss": 0.5671, + "step": 6108 + }, + { + "epoch": 0.78, + "grad_norm": 1.3476665019989014, + "learning_rate": 1.188217129550076e-06, + "loss": 0.6287, + "step": 6109 + }, + { + "epoch": 0.78, + "grad_norm": 1.3214592933654785, + "learning_rate": 1.1868745883975386e-06, + "loss": 0.5985, + "step": 6110 + }, + { + "epoch": 0.78, + "grad_norm": 1.6524791717529297, + "learning_rate": 1.1855327039783481e-06, + "loss": 0.6008, + "step": 6111 + }, + { + "epoch": 0.78, + "grad_norm": 1.334662914276123, + "learning_rate": 1.1841914765236178e-06, + "loss": 0.5857, + "step": 6112 + }, + { + "epoch": 0.78, + "grad_norm": 5.622629642486572, + "learning_rate": 1.182850906264348e-06, + "loss": 0.663, + "step": 6113 + }, + { + "epoch": 0.78, + "grad_norm": 1.1818311214447021, + "learning_rate": 1.1815109934314218e-06, + "loss": 0.5183, + "step": 6114 + }, + { + "epoch": 0.78, + "grad_norm": 1.55926513671875, + "learning_rate": 1.1801717382556133e-06, + "loss": 0.6356, + "step": 6115 + }, + { + "epoch": 0.78, + "grad_norm": 1.465349793434143, + "learning_rate": 1.1788331409675812e-06, + "loss": 0.5734, + "step": 6116 + }, + { + "epoch": 0.78, + "grad_norm": 1.333842396736145, + "learning_rate": 1.1774952017978748e-06, + "loss": 0.54, + "step": 6117 + }, + { + "epoch": 0.78, + "grad_norm": 1.6432085037231445, + "learning_rate": 1.1761579209769226e-06, + "loss": 0.6167, + "step": 6118 + }, + { + "epoch": 0.78, + "grad_norm": 1.2257115840911865, + "learning_rate": 1.174821298735045e-06, + "loss": 0.5268, + "step": 6119 + }, + { + "epoch": 0.78, + "grad_norm": 1.3372501134872437, + "learning_rate": 1.1734853353024517e-06, + "loss": 0.6231, + "step": 6120 + }, + { + "epoch": 0.78, + "grad_norm": 1.5983772277832031, + "learning_rate": 1.172150030909231e-06, + "loss": 0.6376, + "step": 6121 + }, + { + "epoch": 0.78, + "grad_norm": 1.557028889656067, + "learning_rate": 1.1708153857853637e-06, + "loss": 0.661, + "step": 6122 + }, + { + "epoch": 0.78, + "grad_norm": 1.520965576171875, + "learning_rate": 1.1694814001607152e-06, + "loss": 0.6603, + "step": 6123 + }, + { + "epoch": 0.78, + "grad_norm": 1.3232431411743164, + "learning_rate": 1.1681480742650387e-06, + "loss": 0.5791, + "step": 6124 + }, + { + "epoch": 0.78, + "grad_norm": 1.9522755146026611, + "learning_rate": 1.1668154083279692e-06, + "loss": 0.5742, + "step": 6125 + }, + { + "epoch": 0.78, + "grad_norm": 1.4064555168151855, + "learning_rate": 1.165483402579034e-06, + "loss": 0.6266, + "step": 6126 + }, + { + "epoch": 0.78, + "grad_norm": 1.2465460300445557, + "learning_rate": 1.1641520572476428e-06, + "loss": 0.5173, + "step": 6127 + }, + { + "epoch": 0.79, + "grad_norm": 1.656630039215088, + "learning_rate": 1.162821372563095e-06, + "loss": 0.5574, + "step": 6128 + }, + { + "epoch": 0.79, + "grad_norm": 1.7703921794891357, + "learning_rate": 1.1614913487545704e-06, + "loss": 0.6125, + "step": 6129 + }, + { + "epoch": 0.79, + "grad_norm": 1.557174801826477, + "learning_rate": 1.1601619860511403e-06, + "loss": 0.5712, + "step": 6130 + }, + { + "epoch": 0.79, + "grad_norm": 1.303775429725647, + "learning_rate": 1.1588332846817608e-06, + "loss": 0.5705, + "step": 6131 + }, + { + "epoch": 0.79, + "grad_norm": 1.4587818384170532, + "learning_rate": 1.1575052448752744e-06, + "loss": 0.5648, + "step": 6132 + }, + { + "epoch": 0.79, + "grad_norm": 1.3065063953399658, + "learning_rate": 1.1561778668604068e-06, + "loss": 0.5471, + "step": 6133 + }, + { + "epoch": 0.79, + "grad_norm": 1.1926515102386475, + "learning_rate": 1.1548511508657733e-06, + "loss": 0.5889, + "step": 6134 + }, + { + "epoch": 0.79, + "grad_norm": 1.834707260131836, + "learning_rate": 1.153525097119873e-06, + "loss": 0.5979, + "step": 6135 + }, + { + "epoch": 0.79, + "grad_norm": 1.8028165102005005, + "learning_rate": 1.1521997058510926e-06, + "loss": 0.6267, + "step": 6136 + }, + { + "epoch": 0.79, + "grad_norm": 1.2593283653259277, + "learning_rate": 1.1508749772877032e-06, + "loss": 0.604, + "step": 6137 + }, + { + "epoch": 0.79, + "grad_norm": 1.511449933052063, + "learning_rate": 1.1495509116578628e-06, + "loss": 0.6545, + "step": 6138 + }, + { + "epoch": 0.79, + "grad_norm": 1.41291344165802, + "learning_rate": 1.1482275091896155e-06, + "loss": 0.4631, + "step": 6139 + }, + { + "epoch": 0.79, + "grad_norm": 1.5056250095367432, + "learning_rate": 1.1469047701108888e-06, + "loss": 0.6018, + "step": 6140 + }, + { + "epoch": 0.79, + "grad_norm": 1.5309189558029175, + "learning_rate": 1.145582694649498e-06, + "loss": 0.6386, + "step": 6141 + }, + { + "epoch": 0.79, + "grad_norm": 1.509054183959961, + "learning_rate": 1.144261283033144e-06, + "loss": 0.5576, + "step": 6142 + }, + { + "epoch": 0.79, + "grad_norm": 2.0752272605895996, + "learning_rate": 1.1429405354894147e-06, + "loss": 0.6352, + "step": 6143 + }, + { + "epoch": 0.79, + "grad_norm": 1.3260945081710815, + "learning_rate": 1.1416204522457792e-06, + "loss": 0.5944, + "step": 6144 + }, + { + "epoch": 0.79, + "grad_norm": 1.1211093664169312, + "learning_rate": 1.140301033529596e-06, + "loss": 0.7431, + "step": 6145 + }, + { + "epoch": 0.79, + "grad_norm": 1.5786157846450806, + "learning_rate": 1.1389822795681088e-06, + "loss": 0.573, + "step": 6146 + }, + { + "epoch": 0.79, + "grad_norm": 1.4324467182159424, + "learning_rate": 1.1376641905884472e-06, + "loss": 0.5515, + "step": 6147 + }, + { + "epoch": 0.79, + "grad_norm": 1.2469350099563599, + "learning_rate": 1.1363467668176221e-06, + "loss": 0.532, + "step": 6148 + }, + { + "epoch": 0.79, + "grad_norm": 1.782609462738037, + "learning_rate": 1.135030008482535e-06, + "loss": 0.5981, + "step": 6149 + }, + { + "epoch": 0.79, + "grad_norm": 1.5515788793563843, + "learning_rate": 1.1337139158099708e-06, + "loss": 0.6569, + "step": 6150 + }, + { + "epoch": 0.79, + "grad_norm": 1.6036688089370728, + "learning_rate": 1.1323984890266005e-06, + "loss": 0.6541, + "step": 6151 + }, + { + "epoch": 0.79, + "grad_norm": 1.3179755210876465, + "learning_rate": 1.1310837283589771e-06, + "loss": 0.6114, + "step": 6152 + }, + { + "epoch": 0.79, + "grad_norm": 1.550906777381897, + "learning_rate": 1.1297696340335412e-06, + "loss": 0.614, + "step": 6153 + }, + { + "epoch": 0.79, + "grad_norm": 1.434679388999939, + "learning_rate": 1.1284562062766236e-06, + "loss": 0.5633, + "step": 6154 + }, + { + "epoch": 0.79, + "grad_norm": 1.4772613048553467, + "learning_rate": 1.1271434453144308e-06, + "loss": 0.609, + "step": 6155 + }, + { + "epoch": 0.79, + "grad_norm": 1.4170340299606323, + "learning_rate": 1.1258313513730613e-06, + "loss": 0.6429, + "step": 6156 + }, + { + "epoch": 0.79, + "grad_norm": 1.5838710069656372, + "learning_rate": 1.1245199246784965e-06, + "loss": 0.6858, + "step": 6157 + }, + { + "epoch": 0.79, + "grad_norm": 2.0961270332336426, + "learning_rate": 1.1232091654566024e-06, + "loss": 0.5789, + "step": 6158 + }, + { + "epoch": 0.79, + "grad_norm": 1.436582326889038, + "learning_rate": 1.1218990739331297e-06, + "loss": 0.6391, + "step": 6159 + }, + { + "epoch": 0.79, + "grad_norm": 1.3351346254348755, + "learning_rate": 1.120589650333717e-06, + "loss": 0.5651, + "step": 6160 + }, + { + "epoch": 0.79, + "grad_norm": 1.1929622888565063, + "learning_rate": 1.1192808948838863e-06, + "loss": 0.552, + "step": 6161 + }, + { + "epoch": 0.79, + "grad_norm": 1.3724346160888672, + "learning_rate": 1.1179728078090413e-06, + "loss": 0.6234, + "step": 6162 + }, + { + "epoch": 0.79, + "grad_norm": 1.2918885946273804, + "learning_rate": 1.1166653893344753e-06, + "loss": 0.5676, + "step": 6163 + }, + { + "epoch": 0.79, + "grad_norm": 1.776462435722351, + "learning_rate": 1.115358639685364e-06, + "loss": 0.5481, + "step": 6164 + }, + { + "epoch": 0.79, + "grad_norm": 1.4188241958618164, + "learning_rate": 1.11405255908677e-06, + "loss": 0.5834, + "step": 6165 + }, + { + "epoch": 0.79, + "grad_norm": 1.6242364645004272, + "learning_rate": 1.112747147763637e-06, + "loss": 0.6114, + "step": 6166 + }, + { + "epoch": 0.79, + "grad_norm": 1.0123103857040405, + "learning_rate": 1.1114424059407957e-06, + "loss": 0.7003, + "step": 6167 + }, + { + "epoch": 0.79, + "grad_norm": 1.5645592212677002, + "learning_rate": 1.1101383338429627e-06, + "loss": 0.6156, + "step": 6168 + }, + { + "epoch": 0.79, + "grad_norm": 1.3179627656936646, + "learning_rate": 1.1088349316947388e-06, + "loss": 0.5178, + "step": 6169 + }, + { + "epoch": 0.79, + "grad_norm": 1.7976934909820557, + "learning_rate": 1.107532199720604e-06, + "loss": 0.5481, + "step": 6170 + }, + { + "epoch": 0.79, + "grad_norm": 1.527823567390442, + "learning_rate": 1.1062301381449313e-06, + "loss": 0.6192, + "step": 6171 + }, + { + "epoch": 0.79, + "grad_norm": 1.5737080574035645, + "learning_rate": 1.1049287471919756e-06, + "loss": 0.6406, + "step": 6172 + }, + { + "epoch": 0.79, + "grad_norm": 1.55455482006073, + "learning_rate": 1.1036280270858712e-06, + "loss": 0.703, + "step": 6173 + }, + { + "epoch": 0.79, + "grad_norm": 1.3198282718658447, + "learning_rate": 1.1023279780506425e-06, + "loss": 0.6418, + "step": 6174 + }, + { + "epoch": 0.79, + "grad_norm": 2.4118762016296387, + "learning_rate": 1.1010286003101967e-06, + "loss": 0.6689, + "step": 6175 + }, + { + "epoch": 0.79, + "grad_norm": 1.9782161712646484, + "learning_rate": 1.0997298940883261e-06, + "loss": 0.5931, + "step": 6176 + }, + { + "epoch": 0.79, + "grad_norm": 2.633495807647705, + "learning_rate": 1.0984318596087035e-06, + "loss": 0.6079, + "step": 6177 + }, + { + "epoch": 0.79, + "grad_norm": 1.5618376731872559, + "learning_rate": 1.0971344970948911e-06, + "loss": 0.5518, + "step": 6178 + }, + { + "epoch": 0.79, + "grad_norm": 1.2417227029800415, + "learning_rate": 1.095837806770333e-06, + "loss": 0.5463, + "step": 6179 + }, + { + "epoch": 0.79, + "grad_norm": 2.2190401554107666, + "learning_rate": 1.0945417888583593e-06, + "loss": 0.648, + "step": 6180 + }, + { + "epoch": 0.79, + "grad_norm": 1.2779728174209595, + "learning_rate": 1.0932464435821789e-06, + "loss": 0.5593, + "step": 6181 + }, + { + "epoch": 0.79, + "grad_norm": 1.9929521083831787, + "learning_rate": 1.0919517711648914e-06, + "loss": 0.5269, + "step": 6182 + }, + { + "epoch": 0.79, + "grad_norm": 1.4795068502426147, + "learning_rate": 1.090657771829477e-06, + "loss": 0.6332, + "step": 6183 + }, + { + "epoch": 0.79, + "grad_norm": 1.3366481065750122, + "learning_rate": 1.0893644457988029e-06, + "loss": 0.5933, + "step": 6184 + }, + { + "epoch": 0.79, + "grad_norm": 1.3400702476501465, + "learning_rate": 1.0880717932956148e-06, + "loss": 0.5539, + "step": 6185 + }, + { + "epoch": 0.79, + "grad_norm": 1.3519129753112793, + "learning_rate": 1.0867798145425468e-06, + "loss": 0.5936, + "step": 6186 + }, + { + "epoch": 0.79, + "grad_norm": 1.3224157094955444, + "learning_rate": 1.085488509762117e-06, + "loss": 0.5686, + "step": 6187 + }, + { + "epoch": 0.79, + "grad_norm": 1.4233057498931885, + "learning_rate": 1.084197879176726e-06, + "loss": 0.7274, + "step": 6188 + }, + { + "epoch": 0.79, + "grad_norm": 1.6368494033813477, + "learning_rate": 1.0829079230086587e-06, + "loss": 0.5718, + "step": 6189 + }, + { + "epoch": 0.79, + "grad_norm": 1.5278626680374146, + "learning_rate": 1.0816186414800838e-06, + "loss": 0.5603, + "step": 6190 + }, + { + "epoch": 0.79, + "grad_norm": 1.375484824180603, + "learning_rate": 1.0803300348130552e-06, + "loss": 0.564, + "step": 6191 + }, + { + "epoch": 0.79, + "grad_norm": 2.002624273300171, + "learning_rate": 1.0790421032295067e-06, + "loss": 0.5248, + "step": 6192 + }, + { + "epoch": 0.79, + "grad_norm": 1.4766088724136353, + "learning_rate": 1.0777548469512588e-06, + "loss": 0.5901, + "step": 6193 + }, + { + "epoch": 0.79, + "grad_norm": 1.3245599269866943, + "learning_rate": 1.0764682662000165e-06, + "loss": 0.5882, + "step": 6194 + }, + { + "epoch": 0.79, + "grad_norm": 1.1820677518844604, + "learning_rate": 1.0751823611973677e-06, + "loss": 0.5516, + "step": 6195 + }, + { + "epoch": 0.79, + "grad_norm": 1.272409439086914, + "learning_rate": 1.0738971321647811e-06, + "loss": 0.6358, + "step": 6196 + }, + { + "epoch": 0.79, + "grad_norm": 1.234097957611084, + "learning_rate": 1.0726125793236124e-06, + "loss": 0.5142, + "step": 6197 + }, + { + "epoch": 0.79, + "grad_norm": 1.3627111911773682, + "learning_rate": 1.0713287028950997e-06, + "loss": 0.5465, + "step": 6198 + }, + { + "epoch": 0.79, + "grad_norm": 1.6191880702972412, + "learning_rate": 1.0700455031003665e-06, + "loss": 0.6199, + "step": 6199 + }, + { + "epoch": 0.79, + "grad_norm": 1.6177563667297363, + "learning_rate": 1.0687629801604143e-06, + "loss": 0.6624, + "step": 6200 + }, + { + "epoch": 0.79, + "grad_norm": 1.1564007997512817, + "learning_rate": 1.0674811342961332e-06, + "loss": 0.5907, + "step": 6201 + }, + { + "epoch": 0.79, + "grad_norm": 1.260568618774414, + "learning_rate": 1.0661999657282968e-06, + "loss": 0.5627, + "step": 6202 + }, + { + "epoch": 0.79, + "grad_norm": 1.5022335052490234, + "learning_rate": 1.0649194746775576e-06, + "loss": 0.6852, + "step": 6203 + }, + { + "epoch": 0.79, + "grad_norm": 1.435604453086853, + "learning_rate": 1.063639661364454e-06, + "loss": 0.6222, + "step": 6204 + }, + { + "epoch": 0.79, + "grad_norm": 1.1896655559539795, + "learning_rate": 1.0623605260094105e-06, + "loss": 0.5527, + "step": 6205 + }, + { + "epoch": 0.8, + "grad_norm": 1.4578163623809814, + "learning_rate": 1.0610820688327323e-06, + "loss": 0.6038, + "step": 6206 + }, + { + "epoch": 0.8, + "grad_norm": 1.404301643371582, + "learning_rate": 1.0598042900546051e-06, + "loss": 0.554, + "step": 6207 + }, + { + "epoch": 0.8, + "grad_norm": 1.2354761362075806, + "learning_rate": 1.058527189895101e-06, + "loss": 0.6951, + "step": 6208 + }, + { + "epoch": 0.8, + "grad_norm": 1.9446297883987427, + "learning_rate": 1.0572507685741767e-06, + "loss": 0.5824, + "step": 6209 + }, + { + "epoch": 0.8, + "grad_norm": 1.273754596710205, + "learning_rate": 1.055975026311667e-06, + "loss": 0.6316, + "step": 6210 + }, + { + "epoch": 0.8, + "grad_norm": 1.49468195438385, + "learning_rate": 1.0546999633272935e-06, + "loss": 0.6304, + "step": 6211 + }, + { + "epoch": 0.8, + "grad_norm": 1.2683830261230469, + "learning_rate": 1.0534255798406602e-06, + "loss": 0.5711, + "step": 6212 + }, + { + "epoch": 0.8, + "grad_norm": 1.4558610916137695, + "learning_rate": 1.0521518760712545e-06, + "loss": 0.5594, + "step": 6213 + }, + { + "epoch": 0.8, + "grad_norm": 1.4848997592926025, + "learning_rate": 1.0508788522384443e-06, + "loss": 0.5832, + "step": 6214 + }, + { + "epoch": 0.8, + "grad_norm": 1.1660897731781006, + "learning_rate": 1.0496065085614827e-06, + "loss": 0.5497, + "step": 6215 + }, + { + "epoch": 0.8, + "grad_norm": 1.3938348293304443, + "learning_rate": 1.0483348452595056e-06, + "loss": 0.6248, + "step": 6216 + }, + { + "epoch": 0.8, + "grad_norm": 1.2313079833984375, + "learning_rate": 1.0470638625515316e-06, + "loss": 0.549, + "step": 6217 + }, + { + "epoch": 0.8, + "grad_norm": 1.4544310569763184, + "learning_rate": 1.0457935606564596e-06, + "loss": 0.5853, + "step": 6218 + }, + { + "epoch": 0.8, + "grad_norm": 1.7911359071731567, + "learning_rate": 1.044523939793075e-06, + "loss": 0.6357, + "step": 6219 + }, + { + "epoch": 0.8, + "grad_norm": 1.5014184713363647, + "learning_rate": 1.0432550001800435e-06, + "loss": 0.6137, + "step": 6220 + }, + { + "epoch": 0.8, + "grad_norm": 1.3622461557388306, + "learning_rate": 1.0419867420359141e-06, + "loss": 0.5823, + "step": 6221 + }, + { + "epoch": 0.8, + "grad_norm": 1.2310843467712402, + "learning_rate": 1.0407191655791188e-06, + "loss": 0.5025, + "step": 6222 + }, + { + "epoch": 0.8, + "grad_norm": 1.638079047203064, + "learning_rate": 1.039452271027972e-06, + "loss": 0.6757, + "step": 6223 + }, + { + "epoch": 0.8, + "grad_norm": 1.6458463668823242, + "learning_rate": 1.038186058600672e-06, + "loss": 0.4908, + "step": 6224 + }, + { + "epoch": 0.8, + "grad_norm": 1.4010100364685059, + "learning_rate": 1.036920528515295e-06, + "loss": 0.6098, + "step": 6225 + }, + { + "epoch": 0.8, + "grad_norm": 1.987982153892517, + "learning_rate": 1.0356556809898044e-06, + "loss": 0.59, + "step": 6226 + }, + { + "epoch": 0.8, + "grad_norm": 1.2834595441818237, + "learning_rate": 1.0343915162420443e-06, + "loss": 0.5335, + "step": 6227 + }, + { + "epoch": 0.8, + "grad_norm": 1.4950449466705322, + "learning_rate": 1.0331280344897425e-06, + "loss": 0.5466, + "step": 6228 + }, + { + "epoch": 0.8, + "grad_norm": 1.2922565937042236, + "learning_rate": 1.0318652359505065e-06, + "loss": 0.5139, + "step": 6229 + }, + { + "epoch": 0.8, + "grad_norm": 1.3083853721618652, + "learning_rate": 1.0306031208418277e-06, + "loss": 0.5349, + "step": 6230 + }, + { + "epoch": 0.8, + "grad_norm": 1.3209140300750732, + "learning_rate": 1.0293416893810803e-06, + "loss": 0.5987, + "step": 6231 + }, + { + "epoch": 0.8, + "grad_norm": 1.6582406759262085, + "learning_rate": 1.028080941785522e-06, + "loss": 0.6124, + "step": 6232 + }, + { + "epoch": 0.8, + "grad_norm": 1.5571374893188477, + "learning_rate": 1.0268208782722876e-06, + "loss": 0.6194, + "step": 6233 + }, + { + "epoch": 0.8, + "grad_norm": 6.914717197418213, + "learning_rate": 1.0255614990583995e-06, + "loss": 0.5848, + "step": 6234 + }, + { + "epoch": 0.8, + "grad_norm": 1.1963386535644531, + "learning_rate": 1.0243028043607588e-06, + "loss": 0.5973, + "step": 6235 + }, + { + "epoch": 0.8, + "grad_norm": 2.909210205078125, + "learning_rate": 1.023044794396153e-06, + "loss": 0.6383, + "step": 6236 + }, + { + "epoch": 0.8, + "grad_norm": 1.247591495513916, + "learning_rate": 1.0217874693812458e-06, + "loss": 0.643, + "step": 6237 + }, + { + "epoch": 0.8, + "grad_norm": 1.2099329233169556, + "learning_rate": 1.0205308295325844e-06, + "loss": 0.6963, + "step": 6238 + }, + { + "epoch": 0.8, + "grad_norm": 1.6623411178588867, + "learning_rate": 1.0192748750666053e-06, + "loss": 0.5453, + "step": 6239 + }, + { + "epoch": 0.8, + "grad_norm": 1.3051793575286865, + "learning_rate": 1.0180196061996162e-06, + "loss": 0.5942, + "step": 6240 + }, + { + "epoch": 0.8, + "grad_norm": 1.3945056200027466, + "learning_rate": 1.0167650231478132e-06, + "loss": 0.5567, + "step": 6241 + }, + { + "epoch": 0.8, + "grad_norm": 1.3888925313949585, + "learning_rate": 1.0155111261272726e-06, + "loss": 0.5803, + "step": 6242 + }, + { + "epoch": 0.8, + "grad_norm": 1.3073126077651978, + "learning_rate": 1.014257915353954e-06, + "loss": 0.5863, + "step": 6243 + }, + { + "epoch": 0.8, + "grad_norm": 1.203535556793213, + "learning_rate": 1.0130053910436955e-06, + "loss": 0.5271, + "step": 6244 + }, + { + "epoch": 0.8, + "grad_norm": 1.2935106754302979, + "learning_rate": 1.0117535534122191e-06, + "loss": 0.5767, + "step": 6245 + }, + { + "epoch": 0.8, + "grad_norm": 1.3383389711380005, + "learning_rate": 1.0105024026751292e-06, + "loss": 0.6398, + "step": 6246 + }, + { + "epoch": 0.8, + "grad_norm": 1.4551408290863037, + "learning_rate": 1.0092519390479116e-06, + "loss": 0.6012, + "step": 6247 + }, + { + "epoch": 0.8, + "grad_norm": 1.4838967323303223, + "learning_rate": 1.0080021627459319e-06, + "loss": 0.6087, + "step": 6248 + }, + { + "epoch": 0.8, + "grad_norm": 1.7100906372070312, + "learning_rate": 1.0067530739844394e-06, + "loss": 0.4806, + "step": 6249 + }, + { + "epoch": 0.8, + "grad_norm": 1.233081579208374, + "learning_rate": 1.005504672978564e-06, + "loss": 0.6828, + "step": 6250 + }, + { + "epoch": 0.8, + "grad_norm": 1.1398285627365112, + "learning_rate": 1.0042569599433183e-06, + "loss": 0.5084, + "step": 6251 + }, + { + "epoch": 0.8, + "grad_norm": 1.179826021194458, + "learning_rate": 1.003009935093594e-06, + "loss": 0.5361, + "step": 6252 + }, + { + "epoch": 0.8, + "grad_norm": 1.7407220602035522, + "learning_rate": 1.0017635986441664e-06, + "loss": 0.5924, + "step": 6253 + }, + { + "epoch": 0.8, + "grad_norm": 1.390802025794983, + "learning_rate": 1.0005179508096929e-06, + "loss": 0.6056, + "step": 6254 + }, + { + "epoch": 0.8, + "grad_norm": 1.1375401020050049, + "learning_rate": 9.992729918047085e-07, + "loss": 0.5618, + "step": 6255 + }, + { + "epoch": 0.8, + "grad_norm": 1.4758859872817993, + "learning_rate": 9.980287218436324e-07, + "loss": 0.6488, + "step": 6256 + }, + { + "epoch": 0.8, + "grad_norm": 4.653335094451904, + "learning_rate": 9.967851411407685e-07, + "loss": 0.686, + "step": 6257 + }, + { + "epoch": 0.8, + "grad_norm": 1.4219043254852295, + "learning_rate": 9.955422499102941e-07, + "loss": 0.5705, + "step": 6258 + }, + { + "epoch": 0.8, + "grad_norm": 1.2233293056488037, + "learning_rate": 9.943000483662735e-07, + "loss": 0.4663, + "step": 6259 + }, + { + "epoch": 0.8, + "grad_norm": 1.3139796257019043, + "learning_rate": 9.930585367226508e-07, + "loss": 0.6691, + "step": 6260 + }, + { + "epoch": 0.8, + "grad_norm": 1.184753179550171, + "learning_rate": 9.918177151932522e-07, + "loss": 0.5962, + "step": 6261 + }, + { + "epoch": 0.8, + "grad_norm": 1.4073246717453003, + "learning_rate": 9.90577583991782e-07, + "loss": 0.5851, + "step": 6262 + }, + { + "epoch": 0.8, + "grad_norm": 1.6594538688659668, + "learning_rate": 9.893381433318277e-07, + "loss": 0.6553, + "step": 6263 + }, + { + "epoch": 0.8, + "grad_norm": 1.5163381099700928, + "learning_rate": 9.88099393426859e-07, + "loss": 0.6235, + "step": 6264 + }, + { + "epoch": 0.8, + "grad_norm": 1.2041316032409668, + "learning_rate": 9.868613344902262e-07, + "loss": 0.5982, + "step": 6265 + }, + { + "epoch": 0.8, + "grad_norm": 1.3843533992767334, + "learning_rate": 9.856239667351569e-07, + "loss": 0.5743, + "step": 6266 + }, + { + "epoch": 0.8, + "grad_norm": 1.2988375425338745, + "learning_rate": 9.843872903747641e-07, + "loss": 0.6007, + "step": 6267 + }, + { + "epoch": 0.8, + "grad_norm": 1.8280378580093384, + "learning_rate": 9.831513056220405e-07, + "loss": 0.5599, + "step": 6268 + }, + { + "epoch": 0.8, + "grad_norm": 1.389329433441162, + "learning_rate": 9.819160126898598e-07, + "loss": 0.568, + "step": 6269 + }, + { + "epoch": 0.8, + "grad_norm": 4.785888195037842, + "learning_rate": 9.806814117909742e-07, + "loss": 0.6587, + "step": 6270 + }, + { + "epoch": 0.8, + "grad_norm": 1.6062169075012207, + "learning_rate": 9.794475031380195e-07, + "loss": 0.6054, + "step": 6271 + }, + { + "epoch": 0.8, + "grad_norm": 1.4643789529800415, + "learning_rate": 9.782142869435119e-07, + "loss": 0.5829, + "step": 6272 + }, + { + "epoch": 0.8, + "grad_norm": 1.3099743127822876, + "learning_rate": 9.769817634198475e-07, + "loss": 0.611, + "step": 6273 + }, + { + "epoch": 0.8, + "grad_norm": 1.5942202806472778, + "learning_rate": 9.757499327793035e-07, + "loss": 0.5887, + "step": 6274 + }, + { + "epoch": 0.8, + "grad_norm": 3.8324639797210693, + "learning_rate": 9.745187952340374e-07, + "loss": 0.5849, + "step": 6275 + }, + { + "epoch": 0.8, + "grad_norm": 1.438779592514038, + "learning_rate": 9.732883509960895e-07, + "loss": 0.5977, + "step": 6276 + }, + { + "epoch": 0.8, + "grad_norm": 1.2395045757293701, + "learning_rate": 9.720586002773757e-07, + "loss": 0.5873, + "step": 6277 + }, + { + "epoch": 0.8, + "grad_norm": 1.4779880046844482, + "learning_rate": 9.70829543289697e-07, + "loss": 0.5937, + "step": 6278 + }, + { + "epoch": 0.8, + "grad_norm": 1.641114592552185, + "learning_rate": 9.696011802447337e-07, + "loss": 0.5943, + "step": 6279 + }, + { + "epoch": 0.8, + "grad_norm": 1.8073025941848755, + "learning_rate": 9.683735113540471e-07, + "loss": 0.5827, + "step": 6280 + }, + { + "epoch": 0.8, + "grad_norm": 1.2064876556396484, + "learning_rate": 9.671465368290761e-07, + "loss": 0.587, + "step": 6281 + }, + { + "epoch": 0.8, + "grad_norm": 1.4570554494857788, + "learning_rate": 9.659202568811437e-07, + "loss": 0.5751, + "step": 6282 + }, + { + "epoch": 0.8, + "grad_norm": 1.5656616687774658, + "learning_rate": 9.64694671721451e-07, + "loss": 0.5457, + "step": 6283 + }, + { + "epoch": 0.81, + "grad_norm": 1.6131380796432495, + "learning_rate": 9.634697815610815e-07, + "loss": 0.5962, + "step": 6284 + }, + { + "epoch": 0.81, + "grad_norm": 1.3118836879730225, + "learning_rate": 9.622455866109958e-07, + "loss": 0.5983, + "step": 6285 + }, + { + "epoch": 0.81, + "grad_norm": 1.4274793863296509, + "learning_rate": 9.610220870820374e-07, + "loss": 0.6167, + "step": 6286 + }, + { + "epoch": 0.81, + "grad_norm": 1.510737419128418, + "learning_rate": 9.59799283184929e-07, + "loss": 0.6158, + "step": 6287 + }, + { + "epoch": 0.81, + "grad_norm": 1.8258140087127686, + "learning_rate": 9.585771751302753e-07, + "loss": 0.6055, + "step": 6288 + }, + { + "epoch": 0.81, + "grad_norm": 1.1926887035369873, + "learning_rate": 9.573557631285568e-07, + "loss": 0.687, + "step": 6289 + }, + { + "epoch": 0.81, + "grad_norm": 1.6286548376083374, + "learning_rate": 9.56135047390137e-07, + "loss": 0.6492, + "step": 6290 + }, + { + "epoch": 0.81, + "grad_norm": 1.3712109327316284, + "learning_rate": 9.549150281252633e-07, + "loss": 0.5782, + "step": 6291 + }, + { + "epoch": 0.81, + "grad_norm": 1.3473654985427856, + "learning_rate": 9.536957055440555e-07, + "loss": 0.6219, + "step": 6292 + }, + { + "epoch": 0.81, + "grad_norm": 1.4779305458068848, + "learning_rate": 9.524770798565174e-07, + "loss": 0.5425, + "step": 6293 + }, + { + "epoch": 0.81, + "grad_norm": 1.329278826713562, + "learning_rate": 9.512591512725344e-07, + "loss": 0.6175, + "step": 6294 + }, + { + "epoch": 0.81, + "grad_norm": 1.6562693119049072, + "learning_rate": 9.500419200018695e-07, + "loss": 0.5724, + "step": 6295 + }, + { + "epoch": 0.81, + "grad_norm": 1.2669323682785034, + "learning_rate": 9.488253862541641e-07, + "loss": 0.5622, + "step": 6296 + }, + { + "epoch": 0.81, + "grad_norm": 1.161300539970398, + "learning_rate": 9.476095502389432e-07, + "loss": 0.5179, + "step": 6297 + }, + { + "epoch": 0.81, + "grad_norm": 1.5029854774475098, + "learning_rate": 9.463944121656088e-07, + "loss": 0.6479, + "step": 6298 + }, + { + "epoch": 0.81, + "grad_norm": 2.1170594692230225, + "learning_rate": 9.451799722434462e-07, + "loss": 0.5316, + "step": 6299 + }, + { + "epoch": 0.81, + "grad_norm": 1.3949368000030518, + "learning_rate": 9.439662306816144e-07, + "loss": 0.6038, + "step": 6300 + }, + { + "epoch": 0.81, + "grad_norm": 1.2971605062484741, + "learning_rate": 9.427531876891577e-07, + "loss": 0.618, + "step": 6301 + }, + { + "epoch": 0.81, + "grad_norm": 1.5163462162017822, + "learning_rate": 9.415408434749984e-07, + "loss": 0.6335, + "step": 6302 + }, + { + "epoch": 0.81, + "grad_norm": 1.2938801050186157, + "learning_rate": 9.403291982479368e-07, + "loss": 0.6646, + "step": 6303 + }, + { + "epoch": 0.81, + "grad_norm": 1.369942545890808, + "learning_rate": 9.391182522166548e-07, + "loss": 0.5532, + "step": 6304 + }, + { + "epoch": 0.81, + "grad_norm": 1.317028284072876, + "learning_rate": 9.379080055897133e-07, + "loss": 0.5684, + "step": 6305 + }, + { + "epoch": 0.81, + "grad_norm": 1.504163146018982, + "learning_rate": 9.366984585755545e-07, + "loss": 0.6464, + "step": 6306 + }, + { + "epoch": 0.81, + "grad_norm": 1.22713041305542, + "learning_rate": 9.354896113824935e-07, + "loss": 0.6153, + "step": 6307 + }, + { + "epoch": 0.81, + "grad_norm": 1.341958999633789, + "learning_rate": 9.342814642187336e-07, + "loss": 0.5821, + "step": 6308 + }, + { + "epoch": 0.81, + "grad_norm": 1.6462088823318481, + "learning_rate": 9.330740172923547e-07, + "loss": 0.4702, + "step": 6309 + }, + { + "epoch": 0.81, + "grad_norm": 1.349832534790039, + "learning_rate": 9.318672708113113e-07, + "loss": 0.6108, + "step": 6310 + }, + { + "epoch": 0.81, + "grad_norm": 1.3375132083892822, + "learning_rate": 9.306612249834429e-07, + "loss": 0.6344, + "step": 6311 + }, + { + "epoch": 0.81, + "grad_norm": 1.3548191785812378, + "learning_rate": 9.294558800164655e-07, + "loss": 0.5802, + "step": 6312 + }, + { + "epoch": 0.81, + "grad_norm": 1.3934367895126343, + "learning_rate": 9.282512361179779e-07, + "loss": 0.509, + "step": 6313 + }, + { + "epoch": 0.81, + "grad_norm": 1.4095515012741089, + "learning_rate": 9.270472934954516e-07, + "loss": 0.5798, + "step": 6314 + }, + { + "epoch": 0.81, + "grad_norm": 1.2284436225891113, + "learning_rate": 9.258440523562434e-07, + "loss": 0.6162, + "step": 6315 + }, + { + "epoch": 0.81, + "grad_norm": 1.4372409582138062, + "learning_rate": 9.246415129075869e-07, + "loss": 0.6458, + "step": 6316 + }, + { + "epoch": 0.81, + "grad_norm": 1.468173861503601, + "learning_rate": 9.234396753565966e-07, + "loss": 0.5612, + "step": 6317 + }, + { + "epoch": 0.81, + "grad_norm": 1.2699594497680664, + "learning_rate": 9.222385399102613e-07, + "loss": 0.7351, + "step": 6318 + }, + { + "epoch": 0.81, + "grad_norm": 1.6350229978561401, + "learning_rate": 9.210381067754542e-07, + "loss": 0.5474, + "step": 6319 + }, + { + "epoch": 0.81, + "grad_norm": 1.289797306060791, + "learning_rate": 9.198383761589247e-07, + "loss": 0.5446, + "step": 6320 + }, + { + "epoch": 0.81, + "grad_norm": 1.2473235130310059, + "learning_rate": 9.186393482673045e-07, + "loss": 0.5478, + "step": 6321 + }, + { + "epoch": 0.81, + "grad_norm": 3.0618834495544434, + "learning_rate": 9.17441023307098e-07, + "loss": 0.6916, + "step": 6322 + }, + { + "epoch": 0.81, + "grad_norm": 1.326453447341919, + "learning_rate": 9.162434014846944e-07, + "loss": 0.6769, + "step": 6323 + }, + { + "epoch": 0.81, + "grad_norm": 1.895493745803833, + "learning_rate": 9.150464830063593e-07, + "loss": 0.5819, + "step": 6324 + }, + { + "epoch": 0.81, + "grad_norm": 1.2672816514968872, + "learning_rate": 9.13850268078238e-07, + "loss": 0.6084, + "step": 6325 + }, + { + "epoch": 0.81, + "grad_norm": 1.3241233825683594, + "learning_rate": 9.126547569063538e-07, + "loss": 0.5891, + "step": 6326 + }, + { + "epoch": 0.81, + "grad_norm": 1.417624831199646, + "learning_rate": 9.114599496966093e-07, + "loss": 0.5614, + "step": 6327 + }, + { + "epoch": 0.81, + "grad_norm": 1.364613652229309, + "learning_rate": 9.10265846654787e-07, + "loss": 0.5463, + "step": 6328 + }, + { + "epoch": 0.81, + "grad_norm": 1.3848798274993896, + "learning_rate": 9.090724479865443e-07, + "loss": 0.6403, + "step": 6329 + }, + { + "epoch": 0.81, + "grad_norm": 1.2594083547592163, + "learning_rate": 9.07879753897421e-07, + "loss": 0.5747, + "step": 6330 + }, + { + "epoch": 0.81, + "grad_norm": 1.2425966262817383, + "learning_rate": 9.066877645928351e-07, + "loss": 0.5759, + "step": 6331 + }, + { + "epoch": 0.81, + "grad_norm": 1.5962731838226318, + "learning_rate": 9.054964802780836e-07, + "loss": 0.6179, + "step": 6332 + }, + { + "epoch": 0.81, + "grad_norm": 1.6983554363250732, + "learning_rate": 9.043059011583378e-07, + "loss": 0.529, + "step": 6333 + }, + { + "epoch": 0.81, + "grad_norm": 1.4406030178070068, + "learning_rate": 9.031160274386524e-07, + "loss": 0.5584, + "step": 6334 + }, + { + "epoch": 0.81, + "grad_norm": 1.1120250225067139, + "learning_rate": 9.019268593239589e-07, + "loss": 0.5092, + "step": 6335 + }, + { + "epoch": 0.81, + "grad_norm": 1.3159717321395874, + "learning_rate": 9.007383970190692e-07, + "loss": 0.5571, + "step": 6336 + }, + { + "epoch": 0.81, + "grad_norm": 1.4577220678329468, + "learning_rate": 8.995506407286681e-07, + "loss": 0.5735, + "step": 6337 + }, + { + "epoch": 0.81, + "grad_norm": 2.6757638454437256, + "learning_rate": 8.983635906573246e-07, + "loss": 0.6404, + "step": 6338 + }, + { + "epoch": 0.81, + "grad_norm": 1.3575419187545776, + "learning_rate": 8.971772470094841e-07, + "loss": 0.5763, + "step": 6339 + }, + { + "epoch": 0.81, + "grad_norm": 1.1019004583358765, + "learning_rate": 8.959916099894706e-07, + "loss": 0.6899, + "step": 6340 + }, + { + "epoch": 0.81, + "grad_norm": 1.2935214042663574, + "learning_rate": 8.948066798014832e-07, + "loss": 0.6393, + "step": 6341 + }, + { + "epoch": 0.81, + "grad_norm": 1.3130396604537964, + "learning_rate": 8.936224566496049e-07, + "loss": 0.5713, + "step": 6342 + }, + { + "epoch": 0.81, + "grad_norm": 1.318345308303833, + "learning_rate": 8.924389407377948e-07, + "loss": 0.544, + "step": 6343 + }, + { + "epoch": 0.81, + "grad_norm": 1.3377659320831299, + "learning_rate": 8.912561322698859e-07, + "loss": 0.5473, + "step": 6344 + }, + { + "epoch": 0.81, + "grad_norm": 1.4414652585983276, + "learning_rate": 8.900740314495953e-07, + "loss": 0.596, + "step": 6345 + }, + { + "epoch": 0.81, + "grad_norm": 1.5321376323699951, + "learning_rate": 8.888926384805158e-07, + "loss": 0.5581, + "step": 6346 + }, + { + "epoch": 0.81, + "grad_norm": 1.2531845569610596, + "learning_rate": 8.877119535661189e-07, + "loss": 0.5722, + "step": 6347 + }, + { + "epoch": 0.81, + "grad_norm": 1.3081930875778198, + "learning_rate": 8.865319769097513e-07, + "loss": 0.5443, + "step": 6348 + }, + { + "epoch": 0.81, + "grad_norm": 1.565206527709961, + "learning_rate": 8.853527087146413e-07, + "loss": 0.6366, + "step": 6349 + }, + { + "epoch": 0.81, + "grad_norm": 1.6505012512207031, + "learning_rate": 8.841741491838951e-07, + "loss": 0.6175, + "step": 6350 + }, + { + "epoch": 0.81, + "grad_norm": 1.3950464725494385, + "learning_rate": 8.829962985204931e-07, + "loss": 0.607, + "step": 6351 + }, + { + "epoch": 0.81, + "grad_norm": 1.5542408227920532, + "learning_rate": 8.818191569272977e-07, + "loss": 0.6109, + "step": 6352 + }, + { + "epoch": 0.81, + "grad_norm": 1.618653416633606, + "learning_rate": 8.806427246070476e-07, + "loss": 0.5526, + "step": 6353 + }, + { + "epoch": 0.81, + "grad_norm": 1.473083734512329, + "learning_rate": 8.794670017623602e-07, + "loss": 0.6048, + "step": 6354 + }, + { + "epoch": 0.81, + "grad_norm": 1.2885507345199585, + "learning_rate": 8.782919885957276e-07, + "loss": 0.6216, + "step": 6355 + }, + { + "epoch": 0.81, + "grad_norm": 1.6841661930084229, + "learning_rate": 8.771176853095226e-07, + "loss": 0.6373, + "step": 6356 + }, + { + "epoch": 0.81, + "grad_norm": 2.770217180252075, + "learning_rate": 8.759440921059958e-07, + "loss": 0.6398, + "step": 6357 + }, + { + "epoch": 0.81, + "grad_norm": 1.275274395942688, + "learning_rate": 8.747712091872751e-07, + "loss": 0.71, + "step": 6358 + }, + { + "epoch": 0.81, + "grad_norm": 2.0190229415893555, + "learning_rate": 8.735990367553643e-07, + "loss": 0.5194, + "step": 6359 + }, + { + "epoch": 0.81, + "grad_norm": 1.600679636001587, + "learning_rate": 8.72427575012148e-07, + "loss": 0.5969, + "step": 6360 + }, + { + "epoch": 0.81, + "grad_norm": 1.613568663597107, + "learning_rate": 8.712568241593866e-07, + "loss": 0.5324, + "step": 6361 + }, + { + "epoch": 0.82, + "grad_norm": 1.3485932350158691, + "learning_rate": 8.700867843987165e-07, + "loss": 0.5907, + "step": 6362 + }, + { + "epoch": 0.82, + "grad_norm": 1.446373462677002, + "learning_rate": 8.689174559316544e-07, + "loss": 0.5736, + "step": 6363 + }, + { + "epoch": 0.82, + "grad_norm": 1.2776132822036743, + "learning_rate": 8.677488389595923e-07, + "loss": 0.4848, + "step": 6364 + }, + { + "epoch": 0.82, + "grad_norm": 1.2803000211715698, + "learning_rate": 8.665809336838032e-07, + "loss": 0.5473, + "step": 6365 + }, + { + "epoch": 0.82, + "grad_norm": 1.1983240842819214, + "learning_rate": 8.654137403054324e-07, + "loss": 0.5795, + "step": 6366 + }, + { + "epoch": 0.82, + "grad_norm": 1.4812618494033813, + "learning_rate": 8.642472590255063e-07, + "loss": 0.6122, + "step": 6367 + }, + { + "epoch": 0.82, + "grad_norm": 1.6410510540008545, + "learning_rate": 8.630814900449269e-07, + "loss": 0.6678, + "step": 6368 + }, + { + "epoch": 0.82, + "grad_norm": 1.134711742401123, + "learning_rate": 8.619164335644764e-07, + "loss": 0.4751, + "step": 6369 + }, + { + "epoch": 0.82, + "grad_norm": 1.3798121213912964, + "learning_rate": 8.607520897848093e-07, + "loss": 0.6079, + "step": 6370 + }, + { + "epoch": 0.82, + "grad_norm": 1.5486456155776978, + "learning_rate": 8.595884589064618e-07, + "loss": 0.5501, + "step": 6371 + }, + { + "epoch": 0.82, + "grad_norm": 1.4457107782363892, + "learning_rate": 8.584255411298448e-07, + "loss": 0.5237, + "step": 6372 + }, + { + "epoch": 0.82, + "grad_norm": 1.3767578601837158, + "learning_rate": 8.57263336655249e-07, + "loss": 0.5954, + "step": 6373 + }, + { + "epoch": 0.82, + "grad_norm": 1.325913429260254, + "learning_rate": 8.561018456828379e-07, + "loss": 0.5758, + "step": 6374 + }, + { + "epoch": 0.82, + "grad_norm": 1.5465365648269653, + "learning_rate": 8.549410684126547e-07, + "loss": 0.5237, + "step": 6375 + }, + { + "epoch": 0.82, + "grad_norm": 1.5502859354019165, + "learning_rate": 8.537810050446238e-07, + "loss": 0.5968, + "step": 6376 + }, + { + "epoch": 0.82, + "grad_norm": 1.3833034038543701, + "learning_rate": 8.526216557785383e-07, + "loss": 0.4794, + "step": 6377 + }, + { + "epoch": 0.82, + "grad_norm": 1.4041199684143066, + "learning_rate": 8.51463020814074e-07, + "loss": 0.5775, + "step": 6378 + }, + { + "epoch": 0.82, + "grad_norm": 1.3076719045639038, + "learning_rate": 8.503051003507823e-07, + "loss": 0.5465, + "step": 6379 + }, + { + "epoch": 0.82, + "grad_norm": 1.535282015800476, + "learning_rate": 8.49147894588092e-07, + "loss": 0.6229, + "step": 6380 + }, + { + "epoch": 0.82, + "grad_norm": 1.4696766138076782, + "learning_rate": 8.479914037253073e-07, + "loss": 0.5713, + "step": 6381 + }, + { + "epoch": 0.82, + "grad_norm": 1.4727602005004883, + "learning_rate": 8.468356279616102e-07, + "loss": 0.5585, + "step": 6382 + }, + { + "epoch": 0.82, + "grad_norm": 1.4697345495224, + "learning_rate": 8.456805674960594e-07, + "loss": 0.5728, + "step": 6383 + }, + { + "epoch": 0.82, + "grad_norm": 1.5066434144973755, + "learning_rate": 8.44526222527593e-07, + "loss": 0.6151, + "step": 6384 + }, + { + "epoch": 0.82, + "grad_norm": 1.423194408416748, + "learning_rate": 8.433725932550207e-07, + "loss": 0.5928, + "step": 6385 + }, + { + "epoch": 0.82, + "grad_norm": 1.6643799543380737, + "learning_rate": 8.422196798770321e-07, + "loss": 0.6388, + "step": 6386 + }, + { + "epoch": 0.82, + "grad_norm": 1.3613812923431396, + "learning_rate": 8.410674825921938e-07, + "loss": 0.5786, + "step": 6387 + }, + { + "epoch": 0.82, + "grad_norm": 1.1162885427474976, + "learning_rate": 8.399160015989494e-07, + "loss": 0.5944, + "step": 6388 + }, + { + "epoch": 0.82, + "grad_norm": 1.5724189281463623, + "learning_rate": 8.387652370956151e-07, + "loss": 0.5654, + "step": 6389 + }, + { + "epoch": 0.82, + "grad_norm": 1.359290361404419, + "learning_rate": 8.37615189280389e-07, + "loss": 0.5796, + "step": 6390 + }, + { + "epoch": 0.82, + "grad_norm": 1.3778443336486816, + "learning_rate": 8.36465858351343e-07, + "loss": 0.5938, + "step": 6391 + }, + { + "epoch": 0.82, + "grad_norm": 1.4014848470687866, + "learning_rate": 8.353172445064268e-07, + "loss": 0.5879, + "step": 6392 + }, + { + "epoch": 0.82, + "grad_norm": 1.405988097190857, + "learning_rate": 8.341693479434626e-07, + "loss": 0.5998, + "step": 6393 + }, + { + "epoch": 0.82, + "grad_norm": 1.4378615617752075, + "learning_rate": 8.330221688601559e-07, + "loss": 0.6279, + "step": 6394 + }, + { + "epoch": 0.82, + "grad_norm": 1.794307827949524, + "learning_rate": 8.318757074540845e-07, + "loss": 0.6281, + "step": 6395 + }, + { + "epoch": 0.82, + "grad_norm": 1.450239896774292, + "learning_rate": 8.307299639227013e-07, + "loss": 0.5631, + "step": 6396 + }, + { + "epoch": 0.82, + "grad_norm": 1.587395191192627, + "learning_rate": 8.295849384633381e-07, + "loss": 0.5966, + "step": 6397 + }, + { + "epoch": 0.82, + "grad_norm": 1.633143663406372, + "learning_rate": 8.284406312732024e-07, + "loss": 0.587, + "step": 6398 + }, + { + "epoch": 0.82, + "grad_norm": 1.2260631322860718, + "learning_rate": 8.272970425493793e-07, + "loss": 0.5618, + "step": 6399 + }, + { + "epoch": 0.82, + "grad_norm": 1.3946280479431152, + "learning_rate": 8.261541724888256e-07, + "loss": 0.5446, + "step": 6400 + }, + { + "epoch": 0.82, + "grad_norm": 1.1478197574615479, + "learning_rate": 8.250120212883794e-07, + "loss": 0.5878, + "step": 6401 + }, + { + "epoch": 0.82, + "grad_norm": 1.6904059648513794, + "learning_rate": 8.23870589144754e-07, + "loss": 0.6042, + "step": 6402 + }, + { + "epoch": 0.82, + "grad_norm": 1.5287965536117554, + "learning_rate": 8.227298762545354e-07, + "loss": 0.6247, + "step": 6403 + }, + { + "epoch": 0.82, + "grad_norm": 1.4237464666366577, + "learning_rate": 8.215898828141894e-07, + "loss": 0.5744, + "step": 6404 + }, + { + "epoch": 0.82, + "grad_norm": 1.4170470237731934, + "learning_rate": 8.204506090200565e-07, + "loss": 0.5751, + "step": 6405 + }, + { + "epoch": 0.82, + "grad_norm": 1.3677936792373657, + "learning_rate": 8.193120550683553e-07, + "loss": 0.6154, + "step": 6406 + }, + { + "epoch": 0.82, + "grad_norm": 1.2820465564727783, + "learning_rate": 8.181742211551757e-07, + "loss": 0.5579, + "step": 6407 + }, + { + "epoch": 0.82, + "grad_norm": 1.6385856866836548, + "learning_rate": 8.170371074764872e-07, + "loss": 0.6446, + "step": 6408 + }, + { + "epoch": 0.82, + "grad_norm": 1.253929615020752, + "learning_rate": 8.159007142281356e-07, + "loss": 0.55, + "step": 6409 + }, + { + "epoch": 0.82, + "grad_norm": 1.2222574949264526, + "learning_rate": 8.147650416058406e-07, + "loss": 0.5793, + "step": 6410 + }, + { + "epoch": 0.82, + "grad_norm": 1.3179538249969482, + "learning_rate": 8.136300898051996e-07, + "loss": 0.616, + "step": 6411 + }, + { + "epoch": 0.82, + "grad_norm": 1.3393512964248657, + "learning_rate": 8.124958590216841e-07, + "loss": 0.6454, + "step": 6412 + }, + { + "epoch": 0.82, + "grad_norm": 1.482521414756775, + "learning_rate": 8.11362349450644e-07, + "loss": 0.5641, + "step": 6413 + }, + { + "epoch": 0.82, + "grad_norm": 1.3834075927734375, + "learning_rate": 8.102295612873007e-07, + "loss": 0.5252, + "step": 6414 + }, + { + "epoch": 0.82, + "grad_norm": 1.378448724746704, + "learning_rate": 8.090974947267555e-07, + "loss": 0.5853, + "step": 6415 + }, + { + "epoch": 0.82, + "grad_norm": 1.5157331228256226, + "learning_rate": 8.07966149963983e-07, + "loss": 0.6003, + "step": 6416 + }, + { + "epoch": 0.82, + "grad_norm": 1.1425423622131348, + "learning_rate": 8.068355271938366e-07, + "loss": 0.7038, + "step": 6417 + }, + { + "epoch": 0.82, + "grad_norm": 1.4335917234420776, + "learning_rate": 8.057056266110397e-07, + "loss": 0.6066, + "step": 6418 + }, + { + "epoch": 0.82, + "grad_norm": 1.3131961822509766, + "learning_rate": 8.04576448410197e-07, + "loss": 0.4857, + "step": 6419 + }, + { + "epoch": 0.82, + "grad_norm": 1.3881715536117554, + "learning_rate": 8.034479927857852e-07, + "loss": 0.6013, + "step": 6420 + }, + { + "epoch": 0.82, + "grad_norm": 1.620413064956665, + "learning_rate": 8.023202599321605e-07, + "loss": 0.6436, + "step": 6421 + }, + { + "epoch": 0.82, + "grad_norm": 1.1148796081542969, + "learning_rate": 8.011932500435482e-07, + "loss": 0.6773, + "step": 6422 + }, + { + "epoch": 0.82, + "grad_norm": 1.3660540580749512, + "learning_rate": 8.000669633140551e-07, + "loss": 0.6043, + "step": 6423 + }, + { + "epoch": 0.82, + "grad_norm": 1.3749091625213623, + "learning_rate": 7.989413999376605e-07, + "loss": 0.5466, + "step": 6424 + }, + { + "epoch": 0.82, + "grad_norm": 1.1317570209503174, + "learning_rate": 7.978165601082211e-07, + "loss": 0.4955, + "step": 6425 + }, + { + "epoch": 0.82, + "grad_norm": 1.1026514768600464, + "learning_rate": 7.966924440194657e-07, + "loss": 0.5888, + "step": 6426 + }, + { + "epoch": 0.82, + "grad_norm": 1.2896405458450317, + "learning_rate": 7.95569051865e-07, + "loss": 0.5944, + "step": 6427 + }, + { + "epoch": 0.82, + "grad_norm": 1.2608617544174194, + "learning_rate": 7.944463838383093e-07, + "loss": 0.5025, + "step": 6428 + }, + { + "epoch": 0.82, + "grad_norm": 1.1657538414001465, + "learning_rate": 7.933244401327472e-07, + "loss": 0.6009, + "step": 6429 + }, + { + "epoch": 0.82, + "grad_norm": 2.1015689373016357, + "learning_rate": 7.922032209415459e-07, + "loss": 0.6016, + "step": 6430 + }, + { + "epoch": 0.82, + "grad_norm": 1.2412078380584717, + "learning_rate": 7.91082726457813e-07, + "loss": 0.5762, + "step": 6431 + }, + { + "epoch": 0.82, + "grad_norm": 1.2768174409866333, + "learning_rate": 7.899629568745327e-07, + "loss": 0.5765, + "step": 6432 + }, + { + "epoch": 0.82, + "grad_norm": 1.256107211112976, + "learning_rate": 7.888439123845599e-07, + "loss": 0.6115, + "step": 6433 + }, + { + "epoch": 0.82, + "grad_norm": 2.3772521018981934, + "learning_rate": 7.877255931806277e-07, + "loss": 0.546, + "step": 6434 + }, + { + "epoch": 0.82, + "grad_norm": 1.5032480955123901, + "learning_rate": 7.866079994553444e-07, + "loss": 0.6249, + "step": 6435 + }, + { + "epoch": 0.82, + "grad_norm": 1.3456975221633911, + "learning_rate": 7.854911314011942e-07, + "loss": 0.5095, + "step": 6436 + }, + { + "epoch": 0.82, + "grad_norm": 1.1659880876541138, + "learning_rate": 7.843749892105323e-07, + "loss": 0.5498, + "step": 6437 + }, + { + "epoch": 0.82, + "grad_norm": 1.2211744785308838, + "learning_rate": 7.832595730755927e-07, + "loss": 0.5963, + "step": 6438 + }, + { + "epoch": 0.82, + "grad_norm": 1.3915050029754639, + "learning_rate": 7.821448831884831e-07, + "loss": 0.6141, + "step": 6439 + }, + { + "epoch": 0.83, + "grad_norm": 2.694148302078247, + "learning_rate": 7.810309197411875e-07, + "loss": 0.6192, + "step": 6440 + }, + { + "epoch": 0.83, + "grad_norm": 1.6715288162231445, + "learning_rate": 7.799176829255612e-07, + "loss": 0.6291, + "step": 6441 + }, + { + "epoch": 0.83, + "grad_norm": 1.2998833656311035, + "learning_rate": 7.788051729333373e-07, + "loss": 0.6008, + "step": 6442 + }, + { + "epoch": 0.83, + "grad_norm": 1.360581874847412, + "learning_rate": 7.776933899561239e-07, + "loss": 0.5631, + "step": 6443 + }, + { + "epoch": 0.83, + "grad_norm": 1.3779425621032715, + "learning_rate": 7.765823341854017e-07, + "loss": 0.488, + "step": 6444 + }, + { + "epoch": 0.83, + "grad_norm": 1.4948135614395142, + "learning_rate": 7.754720058125293e-07, + "loss": 0.6333, + "step": 6445 + }, + { + "epoch": 0.83, + "grad_norm": 2.061363935470581, + "learning_rate": 7.743624050287363e-07, + "loss": 0.532, + "step": 6446 + }, + { + "epoch": 0.83, + "grad_norm": 1.2560065984725952, + "learning_rate": 7.732535320251316e-07, + "loss": 0.5488, + "step": 6447 + }, + { + "epoch": 0.83, + "grad_norm": 1.488149642944336, + "learning_rate": 7.721453869926926e-07, + "loss": 0.5659, + "step": 6448 + }, + { + "epoch": 0.83, + "grad_norm": 1.2953224182128906, + "learning_rate": 7.710379701222764e-07, + "loss": 0.6267, + "step": 6449 + }, + { + "epoch": 0.83, + "grad_norm": 2.0666239261627197, + "learning_rate": 7.699312816046139e-07, + "loss": 0.6492, + "step": 6450 + }, + { + "epoch": 0.83, + "grad_norm": 1.348408579826355, + "learning_rate": 7.688253216303082e-07, + "loss": 0.5643, + "step": 6451 + }, + { + "epoch": 0.83, + "grad_norm": 1.6481759548187256, + "learning_rate": 7.677200903898386e-07, + "loss": 0.6163, + "step": 6452 + }, + { + "epoch": 0.83, + "grad_norm": 1.656936526298523, + "learning_rate": 7.666155880735593e-07, + "loss": 0.6407, + "step": 6453 + }, + { + "epoch": 0.83, + "grad_norm": 1.1746383905410767, + "learning_rate": 7.655118148716989e-07, + "loss": 0.75, + "step": 6454 + }, + { + "epoch": 0.83, + "grad_norm": 1.4573662281036377, + "learning_rate": 7.644087709743586e-07, + "loss": 0.5225, + "step": 6455 + }, + { + "epoch": 0.83, + "grad_norm": 1.2147821187973022, + "learning_rate": 7.633064565715159e-07, + "loss": 0.7242, + "step": 6456 + }, + { + "epoch": 0.83, + "grad_norm": 1.5926049947738647, + "learning_rate": 7.622048718530218e-07, + "loss": 0.5808, + "step": 6457 + }, + { + "epoch": 0.83, + "grad_norm": 1.6897194385528564, + "learning_rate": 7.611040170086032e-07, + "loss": 0.6736, + "step": 6458 + }, + { + "epoch": 0.83, + "grad_norm": 1.5272244215011597, + "learning_rate": 7.60003892227858e-07, + "loss": 0.6596, + "step": 6459 + }, + { + "epoch": 0.83, + "grad_norm": 1.4822163581848145, + "learning_rate": 7.589044977002607e-07, + "loss": 0.677, + "step": 6460 + }, + { + "epoch": 0.83, + "grad_norm": 4.653491497039795, + "learning_rate": 7.578058336151611e-07, + "loss": 0.5817, + "step": 6461 + }, + { + "epoch": 0.83, + "grad_norm": 1.3933191299438477, + "learning_rate": 7.5670790016178e-07, + "loss": 0.6101, + "step": 6462 + }, + { + "epoch": 0.83, + "grad_norm": 1.2626433372497559, + "learning_rate": 7.556106975292155e-07, + "loss": 0.5215, + "step": 6463 + }, + { + "epoch": 0.83, + "grad_norm": 1.1545683145523071, + "learning_rate": 7.545142259064376e-07, + "loss": 0.5581, + "step": 6464 + }, + { + "epoch": 0.83, + "grad_norm": 1.349684476852417, + "learning_rate": 7.534184854822929e-07, + "loss": 0.7176, + "step": 6465 + }, + { + "epoch": 0.83, + "grad_norm": 1.2531545162200928, + "learning_rate": 7.523234764454978e-07, + "loss": 0.5055, + "step": 6466 + }, + { + "epoch": 0.83, + "grad_norm": 1.6439186334609985, + "learning_rate": 7.512291989846465e-07, + "loss": 0.5796, + "step": 6467 + }, + { + "epoch": 0.83, + "grad_norm": 1.1926203966140747, + "learning_rate": 7.501356532882064e-07, + "loss": 0.468, + "step": 6468 + }, + { + "epoch": 0.83, + "grad_norm": 1.6254560947418213, + "learning_rate": 7.490428395445198e-07, + "loss": 0.6552, + "step": 6469 + }, + { + "epoch": 0.83, + "grad_norm": 1.2992578744888306, + "learning_rate": 7.479507579417989e-07, + "loss": 0.648, + "step": 6470 + }, + { + "epoch": 0.83, + "grad_norm": 1.4693169593811035, + "learning_rate": 7.46859408668133e-07, + "loss": 0.5684, + "step": 6471 + }, + { + "epoch": 0.83, + "grad_norm": 1.5550191402435303, + "learning_rate": 7.457687919114864e-07, + "loss": 0.5953, + "step": 6472 + }, + { + "epoch": 0.83, + "grad_norm": 1.1891635656356812, + "learning_rate": 7.446789078596961e-07, + "loss": 0.4812, + "step": 6473 + }, + { + "epoch": 0.83, + "grad_norm": 1.6289045810699463, + "learning_rate": 7.435897567004696e-07, + "loss": 0.6572, + "step": 6474 + }, + { + "epoch": 0.83, + "grad_norm": 1.2836540937423706, + "learning_rate": 7.425013386213931e-07, + "loss": 0.5618, + "step": 6475 + }, + { + "epoch": 0.83, + "grad_norm": 2.0774881839752197, + "learning_rate": 7.414136538099242e-07, + "loss": 0.6091, + "step": 6476 + }, + { + "epoch": 0.83, + "grad_norm": 1.2852836847305298, + "learning_rate": 7.403267024533956e-07, + "loss": 0.584, + "step": 6477 + }, + { + "epoch": 0.83, + "grad_norm": 1.08048677444458, + "learning_rate": 7.392404847390089e-07, + "loss": 0.7044, + "step": 6478 + }, + { + "epoch": 0.83, + "grad_norm": 1.3461830615997314, + "learning_rate": 7.381550008538468e-07, + "loss": 0.6609, + "step": 6479 + }, + { + "epoch": 0.83, + "grad_norm": 1.3070403337478638, + "learning_rate": 7.37070250984862e-07, + "loss": 0.5704, + "step": 6480 + }, + { + "epoch": 0.83, + "grad_norm": 1.1511784791946411, + "learning_rate": 7.359862353188774e-07, + "loss": 0.6785, + "step": 6481 + }, + { + "epoch": 0.83, + "grad_norm": 1.5748064517974854, + "learning_rate": 7.34902954042595e-07, + "loss": 0.5357, + "step": 6482 + }, + { + "epoch": 0.83, + "grad_norm": 1.5832569599151611, + "learning_rate": 7.33820407342587e-07, + "loss": 0.6216, + "step": 6483 + }, + { + "epoch": 0.83, + "grad_norm": 1.5341449975967407, + "learning_rate": 7.327385954053023e-07, + "loss": 0.6461, + "step": 6484 + }, + { + "epoch": 0.83, + "grad_norm": 1.2405881881713867, + "learning_rate": 7.316575184170577e-07, + "loss": 0.6008, + "step": 6485 + }, + { + "epoch": 0.83, + "grad_norm": 1.6158428192138672, + "learning_rate": 7.30577176564048e-07, + "loss": 0.6164, + "step": 6486 + }, + { + "epoch": 0.83, + "grad_norm": 1.4766799211502075, + "learning_rate": 7.294975700323404e-07, + "loss": 0.5562, + "step": 6487 + }, + { + "epoch": 0.83, + "grad_norm": 1.7121096849441528, + "learning_rate": 7.284186990078767e-07, + "loss": 0.5626, + "step": 6488 + }, + { + "epoch": 0.83, + "grad_norm": 0.9860489964485168, + "learning_rate": 7.273405636764675e-07, + "loss": 0.6356, + "step": 6489 + }, + { + "epoch": 0.83, + "grad_norm": 1.6255037784576416, + "learning_rate": 7.262631642238011e-07, + "loss": 0.5731, + "step": 6490 + }, + { + "epoch": 0.83, + "grad_norm": 1.4038243293762207, + "learning_rate": 7.25186500835438e-07, + "loss": 0.6349, + "step": 6491 + }, + { + "epoch": 0.83, + "grad_norm": 1.1845976114273071, + "learning_rate": 7.241105736968124e-07, + "loss": 0.5589, + "step": 6492 + }, + { + "epoch": 0.83, + "grad_norm": 1.3688479661941528, + "learning_rate": 7.230353829932285e-07, + "loss": 0.6446, + "step": 6493 + }, + { + "epoch": 0.83, + "grad_norm": 1.30573570728302, + "learning_rate": 7.219609289098672e-07, + "loss": 0.6097, + "step": 6494 + }, + { + "epoch": 0.83, + "grad_norm": 1.4633636474609375, + "learning_rate": 7.208872116317822e-07, + "loss": 0.633, + "step": 6495 + }, + { + "epoch": 0.83, + "grad_norm": 1.5460313558578491, + "learning_rate": 7.198142313438983e-07, + "loss": 0.5651, + "step": 6496 + }, + { + "epoch": 0.83, + "grad_norm": 1.4103299379348755, + "learning_rate": 7.187419882310148e-07, + "loss": 0.5557, + "step": 6497 + }, + { + "epoch": 0.83, + "grad_norm": 1.33799147605896, + "learning_rate": 7.176704824778052e-07, + "loss": 0.5859, + "step": 6498 + }, + { + "epoch": 0.83, + "grad_norm": 1.6135934591293335, + "learning_rate": 7.165997142688124e-07, + "loss": 0.5929, + "step": 6499 + }, + { + "epoch": 0.83, + "grad_norm": 1.3209842443466187, + "learning_rate": 7.155296837884557e-07, + "loss": 0.6129, + "step": 6500 + }, + { + "epoch": 0.83, + "grad_norm": 1.2033591270446777, + "learning_rate": 7.144603912210257e-07, + "loss": 0.6195, + "step": 6501 + }, + { + "epoch": 0.83, + "grad_norm": 1.1654304265975952, + "learning_rate": 7.133918367506876e-07, + "loss": 0.5069, + "step": 6502 + }, + { + "epoch": 0.83, + "grad_norm": 1.3715147972106934, + "learning_rate": 7.123240205614756e-07, + "loss": 0.5825, + "step": 6503 + }, + { + "epoch": 0.83, + "grad_norm": 1.4160212278366089, + "learning_rate": 7.112569428373012e-07, + "loss": 0.5936, + "step": 6504 + }, + { + "epoch": 0.83, + "grad_norm": 1.3100764751434326, + "learning_rate": 7.101906037619466e-07, + "loss": 0.5829, + "step": 6505 + }, + { + "epoch": 0.83, + "grad_norm": 1.678920865058899, + "learning_rate": 7.091250035190678e-07, + "loss": 0.6412, + "step": 6506 + }, + { + "epoch": 0.83, + "grad_norm": 1.4690093994140625, + "learning_rate": 7.080601422921901e-07, + "loss": 0.5738, + "step": 6507 + }, + { + "epoch": 0.83, + "grad_norm": 1.3475110530853271, + "learning_rate": 7.069960202647169e-07, + "loss": 0.5587, + "step": 6508 + }, + { + "epoch": 0.83, + "grad_norm": 1.3803815841674805, + "learning_rate": 7.059326376199199e-07, + "loss": 0.6277, + "step": 6509 + }, + { + "epoch": 0.83, + "grad_norm": 1.3090627193450928, + "learning_rate": 7.048699945409477e-07, + "loss": 0.5846, + "step": 6510 + }, + { + "epoch": 0.83, + "grad_norm": 1.225256323814392, + "learning_rate": 7.038080912108152e-07, + "loss": 0.5646, + "step": 6511 + }, + { + "epoch": 0.83, + "grad_norm": 1.3957878351211548, + "learning_rate": 7.027469278124155e-07, + "loss": 0.5509, + "step": 6512 + }, + { + "epoch": 0.83, + "grad_norm": 1.2957148551940918, + "learning_rate": 7.016865045285143e-07, + "loss": 0.6144, + "step": 6513 + }, + { + "epoch": 0.83, + "grad_norm": 1.1870754957199097, + "learning_rate": 7.00626821541745e-07, + "loss": 0.6008, + "step": 6514 + }, + { + "epoch": 0.83, + "grad_norm": 1.483576774597168, + "learning_rate": 6.995678790346178e-07, + "loss": 0.6274, + "step": 6515 + }, + { + "epoch": 0.83, + "grad_norm": 1.2268551588058472, + "learning_rate": 6.985096771895139e-07, + "loss": 0.5665, + "step": 6516 + }, + { + "epoch": 0.83, + "grad_norm": 1.424389362335205, + "learning_rate": 6.974522161886882e-07, + "loss": 0.5598, + "step": 6517 + }, + { + "epoch": 0.84, + "grad_norm": 1.3461567163467407, + "learning_rate": 6.963954962142644e-07, + "loss": 0.6131, + "step": 6518 + }, + { + "epoch": 0.84, + "grad_norm": 1.4819175004959106, + "learning_rate": 6.953395174482425e-07, + "loss": 0.6464, + "step": 6519 + }, + { + "epoch": 0.84, + "grad_norm": 1.284936547279358, + "learning_rate": 6.94284280072493e-07, + "loss": 0.6178, + "step": 6520 + }, + { + "epoch": 0.84, + "grad_norm": 1.185971736907959, + "learning_rate": 6.932297842687607e-07, + "loss": 0.6414, + "step": 6521 + }, + { + "epoch": 0.84, + "grad_norm": 1.4137840270996094, + "learning_rate": 6.921760302186587e-07, + "loss": 0.6293, + "step": 6522 + }, + { + "epoch": 0.84, + "grad_norm": 1.347339391708374, + "learning_rate": 6.911230181036755e-07, + "loss": 0.5987, + "step": 6523 + }, + { + "epoch": 0.84, + "grad_norm": 1.255189299583435, + "learning_rate": 6.90070748105171e-07, + "loss": 0.6005, + "step": 6524 + }, + { + "epoch": 0.84, + "grad_norm": 1.5423235893249512, + "learning_rate": 6.890192204043789e-07, + "loss": 0.6264, + "step": 6525 + }, + { + "epoch": 0.84, + "grad_norm": 1.6151106357574463, + "learning_rate": 6.879684351824012e-07, + "loss": 0.7098, + "step": 6526 + }, + { + "epoch": 0.84, + "grad_norm": 1.403106451034546, + "learning_rate": 6.869183926202149e-07, + "loss": 0.5491, + "step": 6527 + }, + { + "epoch": 0.84, + "grad_norm": 1.432671308517456, + "learning_rate": 6.858690928986689e-07, + "loss": 0.6377, + "step": 6528 + }, + { + "epoch": 0.84, + "grad_norm": 1.3803349733352661, + "learning_rate": 6.84820536198485e-07, + "loss": 0.5884, + "step": 6529 + }, + { + "epoch": 0.84, + "grad_norm": 1.3720000982284546, + "learning_rate": 6.837727227002522e-07, + "loss": 0.6173, + "step": 6530 + }, + { + "epoch": 0.84, + "grad_norm": 1.2312438488006592, + "learning_rate": 6.827256525844384e-07, + "loss": 0.5887, + "step": 6531 + }, + { + "epoch": 0.84, + "grad_norm": 1.3636536598205566, + "learning_rate": 6.816793260313798e-07, + "loss": 0.5294, + "step": 6532 + }, + { + "epoch": 0.84, + "grad_norm": 1.2737432718276978, + "learning_rate": 6.806337432212834e-07, + "loss": 0.5848, + "step": 6533 + }, + { + "epoch": 0.84, + "grad_norm": 1.3652080297470093, + "learning_rate": 6.795889043342302e-07, + "loss": 0.6028, + "step": 6534 + }, + { + "epoch": 0.84, + "grad_norm": 1.309290885925293, + "learning_rate": 6.785448095501728e-07, + "loss": 0.5539, + "step": 6535 + }, + { + "epoch": 0.84, + "grad_norm": 1.4194649457931519, + "learning_rate": 6.775014590489359e-07, + "loss": 0.5934, + "step": 6536 + }, + { + "epoch": 0.84, + "grad_norm": 1.6991386413574219, + "learning_rate": 6.76458853010214e-07, + "loss": 0.6732, + "step": 6537 + }, + { + "epoch": 0.84, + "grad_norm": 1.4001078605651855, + "learning_rate": 6.75416991613575e-07, + "loss": 0.6728, + "step": 6538 + }, + { + "epoch": 0.84, + "grad_norm": 1.0708200931549072, + "learning_rate": 6.743758750384588e-07, + "loss": 0.6719, + "step": 6539 + }, + { + "epoch": 0.84, + "grad_norm": 2.4985439777374268, + "learning_rate": 6.733355034641776e-07, + "loss": 0.5891, + "step": 6540 + }, + { + "epoch": 0.84, + "grad_norm": 1.2028065919876099, + "learning_rate": 6.722958770699123e-07, + "loss": 0.6056, + "step": 6541 + }, + { + "epoch": 0.84, + "grad_norm": 1.2086281776428223, + "learning_rate": 6.712569960347182e-07, + "loss": 0.5668, + "step": 6542 + }, + { + "epoch": 0.84, + "grad_norm": 1.2189396619796753, + "learning_rate": 6.702188605375226e-07, + "loss": 0.7119, + "step": 6543 + }, + { + "epoch": 0.84, + "grad_norm": 1.2520568370819092, + "learning_rate": 6.691814707571209e-07, + "loss": 0.5847, + "step": 6544 + }, + { + "epoch": 0.84, + "grad_norm": 1.4773744344711304, + "learning_rate": 6.681448268721841e-07, + "loss": 0.5806, + "step": 6545 + }, + { + "epoch": 0.84, + "grad_norm": 1.519181728363037, + "learning_rate": 6.671089290612526e-07, + "loss": 0.5644, + "step": 6546 + }, + { + "epoch": 0.84, + "grad_norm": 1.489691972732544, + "learning_rate": 6.660737775027381e-07, + "loss": 0.6373, + "step": 6547 + }, + { + "epoch": 0.84, + "grad_norm": 1.2815091609954834, + "learning_rate": 6.65039372374926e-07, + "loss": 0.5611, + "step": 6548 + }, + { + "epoch": 0.84, + "grad_norm": 1.2832820415496826, + "learning_rate": 6.640057138559702e-07, + "loss": 0.6027, + "step": 6549 + }, + { + "epoch": 0.84, + "grad_norm": 1.3316545486450195, + "learning_rate": 6.629728021238991e-07, + "loss": 0.5918, + "step": 6550 + }, + { + "epoch": 0.84, + "grad_norm": 1.3692222833633423, + "learning_rate": 6.619406373566079e-07, + "loss": 0.6008, + "step": 6551 + }, + { + "epoch": 0.84, + "grad_norm": 1.561578392982483, + "learning_rate": 6.609092197318678e-07, + "loss": 0.5702, + "step": 6552 + }, + { + "epoch": 0.84, + "grad_norm": 1.8879069089889526, + "learning_rate": 6.598785494273197e-07, + "loss": 0.5556, + "step": 6553 + }, + { + "epoch": 0.84, + "grad_norm": 1.5902771949768066, + "learning_rate": 6.588486266204758e-07, + "loss": 0.6338, + "step": 6554 + }, + { + "epoch": 0.84, + "grad_norm": 1.5381138324737549, + "learning_rate": 6.578194514887176e-07, + "loss": 0.5805, + "step": 6555 + }, + { + "epoch": 0.84, + "grad_norm": 1.417212963104248, + "learning_rate": 6.567910242093012e-07, + "loss": 0.5621, + "step": 6556 + }, + { + "epoch": 0.84, + "grad_norm": 1.422182321548462, + "learning_rate": 6.557633449593515e-07, + "loss": 0.516, + "step": 6557 + }, + { + "epoch": 0.84, + "grad_norm": 1.2522732019424438, + "learning_rate": 6.547364139158674e-07, + "loss": 0.6348, + "step": 6558 + }, + { + "epoch": 0.84, + "grad_norm": 1.4380911588668823, + "learning_rate": 6.537102312557137e-07, + "loss": 0.6165, + "step": 6559 + }, + { + "epoch": 0.84, + "grad_norm": 1.3114643096923828, + "learning_rate": 6.52684797155631e-07, + "loss": 0.5736, + "step": 6560 + }, + { + "epoch": 0.84, + "grad_norm": 1.367963433265686, + "learning_rate": 6.516601117922295e-07, + "loss": 0.5807, + "step": 6561 + }, + { + "epoch": 0.84, + "grad_norm": 1.5235767364501953, + "learning_rate": 6.506361753419916e-07, + "loss": 0.6142, + "step": 6562 + }, + { + "epoch": 0.84, + "grad_norm": 2.021899700164795, + "learning_rate": 6.496129879812673e-07, + "loss": 0.5628, + "step": 6563 + }, + { + "epoch": 0.84, + "grad_norm": 1.5203709602355957, + "learning_rate": 6.485905498862799e-07, + "loss": 0.6086, + "step": 6564 + }, + { + "epoch": 0.84, + "grad_norm": 1.330996036529541, + "learning_rate": 6.475688612331265e-07, + "loss": 0.5845, + "step": 6565 + }, + { + "epoch": 0.84, + "grad_norm": 1.3727091550827026, + "learning_rate": 6.465479221977694e-07, + "loss": 0.596, + "step": 6566 + }, + { + "epoch": 0.84, + "grad_norm": 1.5101768970489502, + "learning_rate": 6.455277329560456e-07, + "loss": 0.5664, + "step": 6567 + }, + { + "epoch": 0.84, + "grad_norm": 1.3237074613571167, + "learning_rate": 6.445082936836616e-07, + "loss": 0.6286, + "step": 6568 + }, + { + "epoch": 0.84, + "grad_norm": 1.626060962677002, + "learning_rate": 6.434896045561967e-07, + "loss": 0.5562, + "step": 6569 + }, + { + "epoch": 0.84, + "grad_norm": 1.2659837007522583, + "learning_rate": 6.424716657490965e-07, + "loss": 0.5489, + "step": 6570 + }, + { + "epoch": 0.84, + "grad_norm": 1.7719166278839111, + "learning_rate": 6.414544774376819e-07, + "loss": 0.6006, + "step": 6571 + }, + { + "epoch": 0.84, + "grad_norm": 1.414229393005371, + "learning_rate": 6.404380397971432e-07, + "loss": 0.5606, + "step": 6572 + }, + { + "epoch": 0.84, + "grad_norm": 1.48387610912323, + "learning_rate": 6.394223530025418e-07, + "loss": 0.5362, + "step": 6573 + }, + { + "epoch": 0.84, + "grad_norm": 1.4343132972717285, + "learning_rate": 6.384074172288068e-07, + "loss": 0.6049, + "step": 6574 + }, + { + "epoch": 0.84, + "grad_norm": 1.396812081336975, + "learning_rate": 6.373932326507415e-07, + "loss": 0.5969, + "step": 6575 + }, + { + "epoch": 0.84, + "grad_norm": 1.5124541521072388, + "learning_rate": 6.363797994430182e-07, + "loss": 0.6075, + "step": 6576 + }, + { + "epoch": 0.84, + "grad_norm": 1.2993794679641724, + "learning_rate": 6.353671177801824e-07, + "loss": 0.6173, + "step": 6577 + }, + { + "epoch": 0.84, + "grad_norm": 1.5706703662872314, + "learning_rate": 6.343551878366444e-07, + "loss": 0.6263, + "step": 6578 + }, + { + "epoch": 0.84, + "grad_norm": 1.3450409173965454, + "learning_rate": 6.333440097866905e-07, + "loss": 0.6767, + "step": 6579 + }, + { + "epoch": 0.84, + "grad_norm": 1.1720281839370728, + "learning_rate": 6.323335838044753e-07, + "loss": 0.5835, + "step": 6580 + }, + { + "epoch": 0.84, + "grad_norm": 1.2327227592468262, + "learning_rate": 6.31323910064024e-07, + "loss": 0.7767, + "step": 6581 + }, + { + "epoch": 0.84, + "grad_norm": 1.5149428844451904, + "learning_rate": 6.303149887392329e-07, + "loss": 0.6144, + "step": 6582 + }, + { + "epoch": 0.84, + "grad_norm": 1.06614089012146, + "learning_rate": 6.293068200038677e-07, + "loss": 0.6909, + "step": 6583 + }, + { + "epoch": 0.84, + "grad_norm": 1.6291388273239136, + "learning_rate": 6.28299404031566e-07, + "loss": 0.6292, + "step": 6584 + }, + { + "epoch": 0.84, + "grad_norm": 1.6113803386688232, + "learning_rate": 6.272927409958323e-07, + "loss": 0.6151, + "step": 6585 + }, + { + "epoch": 0.84, + "grad_norm": 1.4304330348968506, + "learning_rate": 6.262868310700459e-07, + "loss": 0.6134, + "step": 6586 + }, + { + "epoch": 0.84, + "grad_norm": 1.3153834342956543, + "learning_rate": 6.252816744274542e-07, + "loss": 0.6329, + "step": 6587 + }, + { + "epoch": 0.84, + "grad_norm": 1.266677737236023, + "learning_rate": 6.242772712411754e-07, + "loss": 0.5337, + "step": 6588 + }, + { + "epoch": 0.84, + "grad_norm": 0.8882852792739868, + "learning_rate": 6.232736216841956e-07, + "loss": 0.6046, + "step": 6589 + }, + { + "epoch": 0.84, + "grad_norm": 1.5751484632492065, + "learning_rate": 6.222707259293742e-07, + "loss": 0.59, + "step": 6590 + }, + { + "epoch": 0.84, + "grad_norm": 1.3150429725646973, + "learning_rate": 6.212685841494392e-07, + "loss": 0.6032, + "step": 6591 + }, + { + "epoch": 0.84, + "grad_norm": 1.3497982025146484, + "learning_rate": 6.202671965169909e-07, + "loss": 0.6491, + "step": 6592 + }, + { + "epoch": 0.84, + "grad_norm": 1.6363470554351807, + "learning_rate": 6.192665632044959e-07, + "loss": 0.6138, + "step": 6593 + }, + { + "epoch": 0.84, + "grad_norm": 1.4757647514343262, + "learning_rate": 6.182666843842933e-07, + "loss": 0.6082, + "step": 6594 + }, + { + "epoch": 0.84, + "grad_norm": 1.5103785991668701, + "learning_rate": 6.172675602285933e-07, + "loss": 0.6074, + "step": 6595 + }, + { + "epoch": 0.85, + "grad_norm": 1.3814870119094849, + "learning_rate": 6.162691909094726e-07, + "loss": 0.6029, + "step": 6596 + }, + { + "epoch": 0.85, + "grad_norm": 1.476153016090393, + "learning_rate": 6.152715765988815e-07, + "loss": 0.6853, + "step": 6597 + }, + { + "epoch": 0.85, + "grad_norm": 1.8093366622924805, + "learning_rate": 6.142747174686381e-07, + "loss": 0.5474, + "step": 6598 + }, + { + "epoch": 0.85, + "grad_norm": 1.3409085273742676, + "learning_rate": 6.132786136904312e-07, + "loss": 0.6649, + "step": 6599 + }, + { + "epoch": 0.85, + "grad_norm": 2.276571750640869, + "learning_rate": 6.122832654358196e-07, + "loss": 0.5591, + "step": 6600 + }, + { + "epoch": 0.85, + "grad_norm": 1.5459884405136108, + "learning_rate": 6.112886728762324e-07, + "loss": 0.6567, + "step": 6601 + }, + { + "epoch": 0.85, + "grad_norm": 1.4320536851882935, + "learning_rate": 6.10294836182968e-07, + "loss": 0.6105, + "step": 6602 + }, + { + "epoch": 0.85, + "grad_norm": 0.9960110783576965, + "learning_rate": 6.093017555271935e-07, + "loss": 0.6775, + "step": 6603 + }, + { + "epoch": 0.85, + "grad_norm": 1.5592219829559326, + "learning_rate": 6.08309431079947e-07, + "loss": 0.6259, + "step": 6604 + }, + { + "epoch": 0.85, + "grad_norm": 1.4093371629714966, + "learning_rate": 6.073178630121363e-07, + "loss": 0.6485, + "step": 6605 + }, + { + "epoch": 0.85, + "grad_norm": 1.1359326839447021, + "learning_rate": 6.063270514945402e-07, + "loss": 0.5621, + "step": 6606 + }, + { + "epoch": 0.85, + "grad_norm": 1.2226295471191406, + "learning_rate": 6.05336996697804e-07, + "loss": 0.5737, + "step": 6607 + }, + { + "epoch": 0.85, + "grad_norm": 1.3089041709899902, + "learning_rate": 6.043476987924452e-07, + "loss": 0.5701, + "step": 6608 + }, + { + "epoch": 0.85, + "grad_norm": 1.5044931173324585, + "learning_rate": 6.033591579488501e-07, + "loss": 0.7168, + "step": 6609 + }, + { + "epoch": 0.85, + "grad_norm": 1.2217152118682861, + "learning_rate": 6.023713743372761e-07, + "loss": 0.5639, + "step": 6610 + }, + { + "epoch": 0.85, + "grad_norm": 1.4304804801940918, + "learning_rate": 6.01384348127847e-07, + "loss": 0.6051, + "step": 6611 + }, + { + "epoch": 0.85, + "grad_norm": 1.248270869255066, + "learning_rate": 6.003980794905584e-07, + "loss": 0.5676, + "step": 6612 + }, + { + "epoch": 0.85, + "grad_norm": 1.317533016204834, + "learning_rate": 5.994125685952757e-07, + "loss": 0.5705, + "step": 6613 + }, + { + "epoch": 0.85, + "grad_norm": 1.586898922920227, + "learning_rate": 5.984278156117335e-07, + "loss": 0.6188, + "step": 6614 + }, + { + "epoch": 0.85, + "grad_norm": 1.4527963399887085, + "learning_rate": 5.974438207095328e-07, + "loss": 0.5467, + "step": 6615 + }, + { + "epoch": 0.85, + "grad_norm": 1.3087095022201538, + "learning_rate": 5.964605840581494e-07, + "loss": 0.5782, + "step": 6616 + }, + { + "epoch": 0.85, + "grad_norm": 1.2765275239944458, + "learning_rate": 5.954781058269265e-07, + "loss": 0.5261, + "step": 6617 + }, + { + "epoch": 0.85, + "grad_norm": 1.2288613319396973, + "learning_rate": 5.944963861850738e-07, + "loss": 0.6082, + "step": 6618 + }, + { + "epoch": 0.85, + "grad_norm": 1.3720436096191406, + "learning_rate": 5.935154253016729e-07, + "loss": 0.6148, + "step": 6619 + }, + { + "epoch": 0.85, + "grad_norm": 1.081778883934021, + "learning_rate": 5.925352233456749e-07, + "loss": 0.5368, + "step": 6620 + }, + { + "epoch": 0.85, + "grad_norm": 1.3894524574279785, + "learning_rate": 5.915557804859013e-07, + "loss": 0.5739, + "step": 6621 + }, + { + "epoch": 0.85, + "grad_norm": 1.4773305654525757, + "learning_rate": 5.905770968910379e-07, + "loss": 0.6667, + "step": 6622 + }, + { + "epoch": 0.85, + "grad_norm": 1.2038147449493408, + "learning_rate": 5.895991727296447e-07, + "loss": 0.5568, + "step": 6623 + }, + { + "epoch": 0.85, + "grad_norm": 1.198391318321228, + "learning_rate": 5.886220081701494e-07, + "loss": 0.6375, + "step": 6624 + }, + { + "epoch": 0.85, + "grad_norm": 1.1705589294433594, + "learning_rate": 5.876456033808498e-07, + "loss": 0.5212, + "step": 6625 + }, + { + "epoch": 0.85, + "grad_norm": 1.425979733467102, + "learning_rate": 5.86669958529909e-07, + "loss": 0.6197, + "step": 6626 + }, + { + "epoch": 0.85, + "grad_norm": 1.5084619522094727, + "learning_rate": 5.85695073785364e-07, + "loss": 0.6776, + "step": 6627 + }, + { + "epoch": 0.85, + "grad_norm": 1.321138858795166, + "learning_rate": 5.847209493151185e-07, + "loss": 0.6305, + "step": 6628 + }, + { + "epoch": 0.85, + "grad_norm": 1.3939682245254517, + "learning_rate": 5.837475852869462e-07, + "loss": 0.6287, + "step": 6629 + }, + { + "epoch": 0.85, + "grad_norm": 6.342435836791992, + "learning_rate": 5.82774981868488e-07, + "loss": 0.5973, + "step": 6630 + }, + { + "epoch": 0.85, + "grad_norm": 1.4819163084030151, + "learning_rate": 5.818031392272555e-07, + "loss": 0.6643, + "step": 6631 + }, + { + "epoch": 0.85, + "grad_norm": 1.5554163455963135, + "learning_rate": 5.808320575306292e-07, + "loss": 0.6145, + "step": 6632 + }, + { + "epoch": 0.85, + "grad_norm": 1.2572414875030518, + "learning_rate": 5.798617369458581e-07, + "loss": 0.6317, + "step": 6633 + }, + { + "epoch": 0.85, + "grad_norm": 1.3168387413024902, + "learning_rate": 5.788921776400597e-07, + "loss": 0.6413, + "step": 6634 + }, + { + "epoch": 0.85, + "grad_norm": 1.401464819908142, + "learning_rate": 5.77923379780222e-07, + "loss": 0.6708, + "step": 6635 + }, + { + "epoch": 0.85, + "grad_norm": 1.3466784954071045, + "learning_rate": 5.769553435332009e-07, + "loss": 0.5765, + "step": 6636 + }, + { + "epoch": 0.85, + "grad_norm": 1.2899616956710815, + "learning_rate": 5.759880690657188e-07, + "loss": 0.5885, + "step": 6637 + }, + { + "epoch": 0.85, + "grad_norm": 1.3985425233840942, + "learning_rate": 5.750215565443707e-07, + "loss": 0.5156, + "step": 6638 + }, + { + "epoch": 0.85, + "grad_norm": 1.468558669090271, + "learning_rate": 5.740558061356183e-07, + "loss": 0.6191, + "step": 6639 + }, + { + "epoch": 0.85, + "grad_norm": 1.5268968343734741, + "learning_rate": 5.730908180057937e-07, + "loss": 0.5815, + "step": 6640 + }, + { + "epoch": 0.85, + "grad_norm": 2.1540591716766357, + "learning_rate": 5.721265923210944e-07, + "loss": 0.578, + "step": 6641 + }, + { + "epoch": 0.85, + "grad_norm": 1.5741490125656128, + "learning_rate": 5.711631292475894e-07, + "loss": 0.5626, + "step": 6642 + }, + { + "epoch": 0.85, + "grad_norm": 2.359459638595581, + "learning_rate": 5.702004289512175e-07, + "loss": 0.6464, + "step": 6643 + }, + { + "epoch": 0.85, + "grad_norm": 1.2629036903381348, + "learning_rate": 5.692384915977811e-07, + "loss": 0.6634, + "step": 6644 + }, + { + "epoch": 0.85, + "grad_norm": 1.1385648250579834, + "learning_rate": 5.682773173529565e-07, + "loss": 0.6298, + "step": 6645 + }, + { + "epoch": 0.85, + "grad_norm": 1.4421610832214355, + "learning_rate": 5.673169063822853e-07, + "loss": 0.5755, + "step": 6646 + }, + { + "epoch": 0.85, + "grad_norm": 1.2397493124008179, + "learning_rate": 5.663572588511806e-07, + "loss": 0.5909, + "step": 6647 + }, + { + "epoch": 0.85, + "grad_norm": 1.226677656173706, + "learning_rate": 5.653983749249198e-07, + "loss": 0.6041, + "step": 6648 + }, + { + "epoch": 0.85, + "grad_norm": 1.2758342027664185, + "learning_rate": 5.644402547686518e-07, + "loss": 0.6877, + "step": 6649 + }, + { + "epoch": 0.85, + "grad_norm": 1.6766910552978516, + "learning_rate": 5.63482898547395e-07, + "loss": 0.6041, + "step": 6650 + }, + { + "epoch": 0.85, + "grad_norm": 1.5834779739379883, + "learning_rate": 5.625263064260328e-07, + "loss": 0.5752, + "step": 6651 + }, + { + "epoch": 0.85, + "grad_norm": 2.2436914443969727, + "learning_rate": 5.615704785693193e-07, + "loss": 0.6544, + "step": 6652 + }, + { + "epoch": 0.85, + "grad_norm": 1.5408368110656738, + "learning_rate": 5.606154151418763e-07, + "loss": 0.6088, + "step": 6653 + }, + { + "epoch": 0.85, + "grad_norm": 1.2919232845306396, + "learning_rate": 5.596611163081949e-07, + "loss": 0.5976, + "step": 6654 + }, + { + "epoch": 0.85, + "grad_norm": 1.246882438659668, + "learning_rate": 5.587075822326326e-07, + "loss": 0.5889, + "step": 6655 + }, + { + "epoch": 0.85, + "grad_norm": 1.1017459630966187, + "learning_rate": 5.577548130794164e-07, + "loss": 0.5616, + "step": 6656 + }, + { + "epoch": 0.85, + "grad_norm": 1.254428505897522, + "learning_rate": 5.568028090126415e-07, + "loss": 0.5466, + "step": 6657 + }, + { + "epoch": 0.85, + "grad_norm": 1.4765057563781738, + "learning_rate": 5.558515701962725e-07, + "loss": 0.609, + "step": 6658 + }, + { + "epoch": 0.85, + "grad_norm": 1.194210171699524, + "learning_rate": 5.549010967941387e-07, + "loss": 0.5604, + "step": 6659 + }, + { + "epoch": 0.85, + "grad_norm": 1.3843317031860352, + "learning_rate": 5.539513889699411e-07, + "loss": 0.5851, + "step": 6660 + }, + { + "epoch": 0.85, + "grad_norm": 1.2867571115493774, + "learning_rate": 5.530024468872474e-07, + "loss": 0.6045, + "step": 6661 + }, + { + "epoch": 0.85, + "grad_norm": 1.4414231777191162, + "learning_rate": 5.520542707094945e-07, + "loss": 0.5784, + "step": 6662 + }, + { + "epoch": 0.85, + "grad_norm": 1.4258451461791992, + "learning_rate": 5.511068605999848e-07, + "loss": 0.6573, + "step": 6663 + }, + { + "epoch": 0.85, + "grad_norm": 1.4315848350524902, + "learning_rate": 5.501602167218912e-07, + "loss": 0.5564, + "step": 6664 + }, + { + "epoch": 0.85, + "grad_norm": 1.1932504177093506, + "learning_rate": 5.492143392382537e-07, + "loss": 0.5155, + "step": 6665 + }, + { + "epoch": 0.85, + "grad_norm": 1.22544527053833, + "learning_rate": 5.482692283119817e-07, + "loss": 0.5858, + "step": 6666 + }, + { + "epoch": 0.85, + "grad_norm": 1.4700120687484741, + "learning_rate": 5.473248841058487e-07, + "loss": 0.6701, + "step": 6667 + }, + { + "epoch": 0.85, + "grad_norm": 1.4346530437469482, + "learning_rate": 5.463813067825008e-07, + "loss": 0.647, + "step": 6668 + }, + { + "epoch": 0.85, + "grad_norm": 2.5285699367523193, + "learning_rate": 5.454384965044512e-07, + "loss": 0.5553, + "step": 6669 + }, + { + "epoch": 0.85, + "grad_norm": 5.045205593109131, + "learning_rate": 5.444964534340768e-07, + "loss": 0.5509, + "step": 6670 + }, + { + "epoch": 0.85, + "grad_norm": 1.4717965126037598, + "learning_rate": 5.435551777336273e-07, + "loss": 0.6316, + "step": 6671 + }, + { + "epoch": 0.85, + "grad_norm": 1.3239675760269165, + "learning_rate": 5.426146695652173e-07, + "loss": 0.5174, + "step": 6672 + }, + { + "epoch": 0.85, + "grad_norm": 1.4008818864822388, + "learning_rate": 5.416749290908324e-07, + "loss": 0.5849, + "step": 6673 + }, + { + "epoch": 0.86, + "grad_norm": 1.3032972812652588, + "learning_rate": 5.407359564723202e-07, + "loss": 0.6331, + "step": 6674 + }, + { + "epoch": 0.86, + "grad_norm": 1.2396972179412842, + "learning_rate": 5.397977518714026e-07, + "loss": 0.5312, + "step": 6675 + }, + { + "epoch": 0.86, + "grad_norm": 1.1935837268829346, + "learning_rate": 5.388603154496647e-07, + "loss": 0.5171, + "step": 6676 + }, + { + "epoch": 0.86, + "grad_norm": 1.5318385362625122, + "learning_rate": 5.379236473685623e-07, + "loss": 0.567, + "step": 6677 + }, + { + "epoch": 0.86, + "grad_norm": 1.2079846858978271, + "learning_rate": 5.369877477894154e-07, + "loss": 0.7553, + "step": 6678 + }, + { + "epoch": 0.86, + "grad_norm": 1.4537231922149658, + "learning_rate": 5.360526168734154e-07, + "loss": 0.556, + "step": 6679 + }, + { + "epoch": 0.86, + "grad_norm": 1.2296948432922363, + "learning_rate": 5.351182547816186e-07, + "loss": 0.5602, + "step": 6680 + }, + { + "epoch": 0.86, + "grad_norm": 1.3176974058151245, + "learning_rate": 5.341846616749513e-07, + "loss": 0.6225, + "step": 6681 + }, + { + "epoch": 0.86, + "grad_norm": 1.0236971378326416, + "learning_rate": 5.332518377142043e-07, + "loss": 0.6221, + "step": 6682 + }, + { + "epoch": 0.86, + "grad_norm": 1.3097838163375854, + "learning_rate": 5.32319783060038e-07, + "loss": 0.4918, + "step": 6683 + }, + { + "epoch": 0.86, + "grad_norm": 1.4100576639175415, + "learning_rate": 5.3138849787298e-07, + "loss": 0.6335, + "step": 6684 + }, + { + "epoch": 0.86, + "grad_norm": 1.343997836112976, + "learning_rate": 5.304579823134254e-07, + "loss": 0.5864, + "step": 6685 + }, + { + "epoch": 0.86, + "grad_norm": 1.2703300714492798, + "learning_rate": 5.295282365416365e-07, + "loss": 0.5433, + "step": 6686 + }, + { + "epoch": 0.86, + "grad_norm": 1.460349440574646, + "learning_rate": 5.285992607177432e-07, + "loss": 0.6216, + "step": 6687 + }, + { + "epoch": 0.86, + "grad_norm": 1.8459396362304688, + "learning_rate": 5.276710550017433e-07, + "loss": 0.6032, + "step": 6688 + }, + { + "epoch": 0.86, + "grad_norm": 1.2343941926956177, + "learning_rate": 5.267436195535003e-07, + "loss": 0.7502, + "step": 6689 + }, + { + "epoch": 0.86, + "grad_norm": 1.4660110473632812, + "learning_rate": 5.258169545327462e-07, + "loss": 0.561, + "step": 6690 + }, + { + "epoch": 0.86, + "grad_norm": 1.122137188911438, + "learning_rate": 5.248910600990814e-07, + "loss": 0.5409, + "step": 6691 + }, + { + "epoch": 0.86, + "grad_norm": 1.3979350328445435, + "learning_rate": 5.239659364119703e-07, + "loss": 0.6101, + "step": 6692 + }, + { + "epoch": 0.86, + "grad_norm": 1.6155316829681396, + "learning_rate": 5.230415836307485e-07, + "loss": 0.6467, + "step": 6693 + }, + { + "epoch": 0.86, + "grad_norm": 1.6388894319534302, + "learning_rate": 5.22118001914616e-07, + "loss": 0.6412, + "step": 6694 + }, + { + "epoch": 0.86, + "grad_norm": 1.4391796588897705, + "learning_rate": 5.21195191422642e-07, + "loss": 0.5887, + "step": 6695 + }, + { + "epoch": 0.86, + "grad_norm": 1.4248244762420654, + "learning_rate": 5.202731523137605e-07, + "loss": 0.6395, + "step": 6696 + }, + { + "epoch": 0.86, + "grad_norm": 1.8664724826812744, + "learning_rate": 5.193518847467749e-07, + "loss": 0.5536, + "step": 6697 + }, + { + "epoch": 0.86, + "grad_norm": 1.5709271430969238, + "learning_rate": 5.184313888803544e-07, + "loss": 0.6439, + "step": 6698 + }, + { + "epoch": 0.86, + "grad_norm": 1.2865567207336426, + "learning_rate": 5.175116648730366e-07, + "loss": 0.6003, + "step": 6699 + }, + { + "epoch": 0.86, + "grad_norm": 1.3031384944915771, + "learning_rate": 5.165927128832238e-07, + "loss": 0.6648, + "step": 6700 + }, + { + "epoch": 0.86, + "grad_norm": 1.2907569408416748, + "learning_rate": 5.156745330691871e-07, + "loss": 0.7093, + "step": 6701 + }, + { + "epoch": 0.86, + "grad_norm": 1.391937017440796, + "learning_rate": 5.14757125589066e-07, + "loss": 0.6502, + "step": 6702 + }, + { + "epoch": 0.86, + "grad_norm": 1.4120694398880005, + "learning_rate": 5.138404906008631e-07, + "loss": 0.5444, + "step": 6703 + }, + { + "epoch": 0.86, + "grad_norm": 1.370375156402588, + "learning_rate": 5.129246282624511e-07, + "loss": 0.5462, + "step": 6704 + }, + { + "epoch": 0.86, + "grad_norm": 1.3214823007583618, + "learning_rate": 5.120095387315688e-07, + "loss": 0.6326, + "step": 6705 + }, + { + "epoch": 0.86, + "grad_norm": 1.4014390707015991, + "learning_rate": 5.110952221658228e-07, + "loss": 0.5686, + "step": 6706 + }, + { + "epoch": 0.86, + "grad_norm": 1.2138499021530151, + "learning_rate": 5.101816787226832e-07, + "loss": 0.5041, + "step": 6707 + }, + { + "epoch": 0.86, + "grad_norm": 1.6902697086334229, + "learning_rate": 5.092689085594904e-07, + "loss": 0.5416, + "step": 6708 + }, + { + "epoch": 0.86, + "grad_norm": 1.6033650636672974, + "learning_rate": 5.083569118334508e-07, + "loss": 0.6395, + "step": 6709 + }, + { + "epoch": 0.86, + "grad_norm": 1.4746758937835693, + "learning_rate": 5.074456887016382e-07, + "loss": 0.5835, + "step": 6710 + }, + { + "epoch": 0.86, + "grad_norm": 1.268338918685913, + "learning_rate": 5.065352393209899e-07, + "loss": 0.5185, + "step": 6711 + }, + { + "epoch": 0.86, + "grad_norm": 2.275320291519165, + "learning_rate": 5.056255638483137e-07, + "loss": 0.6441, + "step": 6712 + }, + { + "epoch": 0.86, + "grad_norm": 1.6120179891586304, + "learning_rate": 5.047166624402822e-07, + "loss": 0.582, + "step": 6713 + }, + { + "epoch": 0.86, + "grad_norm": 1.3864774703979492, + "learning_rate": 5.038085352534367e-07, + "loss": 0.542, + "step": 6714 + }, + { + "epoch": 0.86, + "grad_norm": 1.0285900831222534, + "learning_rate": 5.029011824441821e-07, + "loss": 0.5212, + "step": 6715 + }, + { + "epoch": 0.86, + "grad_norm": 1.4503127336502075, + "learning_rate": 5.019946041687911e-07, + "loss": 0.5739, + "step": 6716 + }, + { + "epoch": 0.86, + "grad_norm": 1.4805421829223633, + "learning_rate": 5.010888005834047e-07, + "loss": 0.558, + "step": 6717 + }, + { + "epoch": 0.86, + "grad_norm": 1.2787694931030273, + "learning_rate": 5.001837718440283e-07, + "loss": 0.5401, + "step": 6718 + }, + { + "epoch": 0.86, + "grad_norm": 1.7053654193878174, + "learning_rate": 4.992795181065347e-07, + "loss": 0.711, + "step": 6719 + }, + { + "epoch": 0.86, + "grad_norm": 1.1280864477157593, + "learning_rate": 4.98376039526664e-07, + "loss": 0.5233, + "step": 6720 + }, + { + "epoch": 0.86, + "grad_norm": 1.7056137323379517, + "learning_rate": 4.974733362600225e-07, + "loss": 0.6195, + "step": 6721 + }, + { + "epoch": 0.86, + "grad_norm": 1.1686879396438599, + "learning_rate": 4.965714084620804e-07, + "loss": 0.6126, + "step": 6722 + }, + { + "epoch": 0.86, + "grad_norm": 1.1632872819900513, + "learning_rate": 4.956702562881783e-07, + "loss": 0.5181, + "step": 6723 + }, + { + "epoch": 0.86, + "grad_norm": 1.4708881378173828, + "learning_rate": 4.947698798935196e-07, + "loss": 0.6074, + "step": 6724 + }, + { + "epoch": 0.86, + "grad_norm": 1.4640129804611206, + "learning_rate": 4.938702794331785e-07, + "loss": 0.5477, + "step": 6725 + }, + { + "epoch": 0.86, + "grad_norm": 1.3392497301101685, + "learning_rate": 4.929714550620901e-07, + "loss": 0.5741, + "step": 6726 + }, + { + "epoch": 0.86, + "grad_norm": 1.190356969833374, + "learning_rate": 4.920734069350597e-07, + "loss": 0.5884, + "step": 6727 + }, + { + "epoch": 0.86, + "grad_norm": 1.3186179399490356, + "learning_rate": 4.911761352067579e-07, + "loss": 0.6421, + "step": 6728 + }, + { + "epoch": 0.86, + "grad_norm": 1.3501472473144531, + "learning_rate": 4.902796400317228e-07, + "loss": 0.6491, + "step": 6729 + }, + { + "epoch": 0.86, + "grad_norm": 1.5455275774002075, + "learning_rate": 4.893839215643553e-07, + "loss": 0.6143, + "step": 6730 + }, + { + "epoch": 0.86, + "grad_norm": 1.5439045429229736, + "learning_rate": 4.884889799589254e-07, + "loss": 0.5792, + "step": 6731 + }, + { + "epoch": 0.86, + "grad_norm": 1.2626060247421265, + "learning_rate": 4.875948153695687e-07, + "loss": 0.5075, + "step": 6732 + }, + { + "epoch": 0.86, + "grad_norm": 1.0654497146606445, + "learning_rate": 4.86701427950288e-07, + "loss": 0.5002, + "step": 6733 + }, + { + "epoch": 0.86, + "grad_norm": 1.3448207378387451, + "learning_rate": 4.858088178549492e-07, + "loss": 0.5399, + "step": 6734 + }, + { + "epoch": 0.86, + "grad_norm": 1.4263006448745728, + "learning_rate": 4.849169852372864e-07, + "loss": 0.6053, + "step": 6735 + }, + { + "epoch": 0.86, + "grad_norm": 1.585645079612732, + "learning_rate": 4.84025930250902e-07, + "loss": 0.6796, + "step": 6736 + }, + { + "epoch": 0.86, + "grad_norm": 1.15276038646698, + "learning_rate": 4.831356530492598e-07, + "loss": 0.583, + "step": 6737 + }, + { + "epoch": 0.86, + "grad_norm": 1.6426639556884766, + "learning_rate": 4.822461537856927e-07, + "loss": 0.546, + "step": 6738 + }, + { + "epoch": 0.86, + "grad_norm": 1.0492527484893799, + "learning_rate": 4.813574326133985e-07, + "loss": 0.6961, + "step": 6739 + }, + { + "epoch": 0.86, + "grad_norm": 1.360402226448059, + "learning_rate": 4.804694896854434e-07, + "loss": 0.6052, + "step": 6740 + }, + { + "epoch": 0.86, + "grad_norm": 1.158557415008545, + "learning_rate": 4.795823251547544e-07, + "loss": 0.7179, + "step": 6741 + }, + { + "epoch": 0.86, + "grad_norm": 3.3589587211608887, + "learning_rate": 4.786959391741286e-07, + "loss": 0.5791, + "step": 6742 + }, + { + "epoch": 0.86, + "grad_norm": 1.7391221523284912, + "learning_rate": 4.778103318962296e-07, + "loss": 0.606, + "step": 6743 + }, + { + "epoch": 0.86, + "grad_norm": 1.5412747859954834, + "learning_rate": 4.769255034735831e-07, + "loss": 0.6217, + "step": 6744 + }, + { + "epoch": 0.86, + "grad_norm": 1.2919334173202515, + "learning_rate": 4.760414540585839e-07, + "loss": 0.6057, + "step": 6745 + }, + { + "epoch": 0.86, + "grad_norm": 1.3662687540054321, + "learning_rate": 4.7515818380349074e-07, + "loss": 0.7293, + "step": 6746 + }, + { + "epoch": 0.86, + "grad_norm": 1.417550802230835, + "learning_rate": 4.7427569286043086e-07, + "loss": 0.6307, + "step": 6747 + }, + { + "epoch": 0.86, + "grad_norm": 1.5828347206115723, + "learning_rate": 4.733939813813931e-07, + "loss": 0.5875, + "step": 6748 + }, + { + "epoch": 0.86, + "grad_norm": 1.5326488018035889, + "learning_rate": 4.725130495182356e-07, + "loss": 0.5814, + "step": 6749 + }, + { + "epoch": 0.86, + "grad_norm": 1.4294366836547852, + "learning_rate": 4.716328974226808e-07, + "loss": 0.5564, + "step": 6750 + }, + { + "epoch": 0.86, + "grad_norm": 1.306718111038208, + "learning_rate": 4.707535252463175e-07, + "loss": 0.5206, + "step": 6751 + }, + { + "epoch": 0.87, + "grad_norm": 1.2117685079574585, + "learning_rate": 4.6987493314059716e-07, + "loss": 0.516, + "step": 6752 + }, + { + "epoch": 0.87, + "grad_norm": 1.3194915056228638, + "learning_rate": 4.689971212568428e-07, + "loss": 0.6414, + "step": 6753 + }, + { + "epoch": 0.87, + "grad_norm": 1.456396222114563, + "learning_rate": 4.6812008974623845e-07, + "loss": 0.6447, + "step": 6754 + }, + { + "epoch": 0.87, + "grad_norm": 1.3069777488708496, + "learning_rate": 4.672438387598344e-07, + "loss": 0.6064, + "step": 6755 + }, + { + "epoch": 0.87, + "grad_norm": 1.897487759590149, + "learning_rate": 4.6636836844854706e-07, + "loss": 0.5244, + "step": 6756 + }, + { + "epoch": 0.87, + "grad_norm": 1.408294916152954, + "learning_rate": 4.6549367896315923e-07, + "loss": 0.5875, + "step": 6757 + }, + { + "epoch": 0.87, + "grad_norm": 1.1629550457000732, + "learning_rate": 4.646197704543187e-07, + "loss": 0.7416, + "step": 6758 + }, + { + "epoch": 0.87, + "grad_norm": 1.1940720081329346, + "learning_rate": 4.6374664307253625e-07, + "loss": 0.5178, + "step": 6759 + }, + { + "epoch": 0.87, + "grad_norm": 1.6622729301452637, + "learning_rate": 4.628742969681921e-07, + "loss": 0.6646, + "step": 6760 + }, + { + "epoch": 0.87, + "grad_norm": 2.3263518810272217, + "learning_rate": 4.6200273229152994e-07, + "loss": 0.5751, + "step": 6761 + }, + { + "epoch": 0.87, + "grad_norm": 1.7037662267684937, + "learning_rate": 4.611319491926597e-07, + "loss": 0.6177, + "step": 6762 + }, + { + "epoch": 0.87, + "grad_norm": 1.4251806735992432, + "learning_rate": 4.602619478215542e-07, + "loss": 0.7138, + "step": 6763 + }, + { + "epoch": 0.87, + "grad_norm": 1.6694972515106201, + "learning_rate": 4.593927283280547e-07, + "loss": 0.6646, + "step": 6764 + }, + { + "epoch": 0.87, + "grad_norm": 1.3548812866210938, + "learning_rate": 4.5852429086186646e-07, + "loss": 0.5704, + "step": 6765 + }, + { + "epoch": 0.87, + "grad_norm": 1.6709963083267212, + "learning_rate": 4.576566355725609e-07, + "loss": 0.7045, + "step": 6766 + }, + { + "epoch": 0.87, + "grad_norm": 1.4117552042007446, + "learning_rate": 4.5678976260957243e-07, + "loss": 0.5623, + "step": 6767 + }, + { + "epoch": 0.87, + "grad_norm": 1.4504401683807373, + "learning_rate": 4.5592367212220324e-07, + "loss": 0.6227, + "step": 6768 + }, + { + "epoch": 0.87, + "grad_norm": 1.266184687614441, + "learning_rate": 4.5505836425961956e-07, + "loss": 0.5448, + "step": 6769 + }, + { + "epoch": 0.87, + "grad_norm": 1.2798473834991455, + "learning_rate": 4.541938391708539e-07, + "loss": 0.5534, + "step": 6770 + }, + { + "epoch": 0.87, + "grad_norm": 1.5648812055587769, + "learning_rate": 4.533300970048016e-07, + "loss": 0.6557, + "step": 6771 + }, + { + "epoch": 0.87, + "grad_norm": 1.3745602369308472, + "learning_rate": 4.5246713791022633e-07, + "loss": 0.6003, + "step": 6772 + }, + { + "epoch": 0.87, + "grad_norm": 1.2130157947540283, + "learning_rate": 4.516049620357549e-07, + "loss": 0.7218, + "step": 6773 + }, + { + "epoch": 0.87, + "grad_norm": 1.1825891733169556, + "learning_rate": 4.507435695298784e-07, + "loss": 0.5922, + "step": 6774 + }, + { + "epoch": 0.87, + "grad_norm": 1.352432131767273, + "learning_rate": 4.4988296054095494e-07, + "loss": 0.5628, + "step": 6775 + }, + { + "epoch": 0.87, + "grad_norm": 1.6166142225265503, + "learning_rate": 4.4902313521720696e-07, + "loss": 0.634, + "step": 6776 + }, + { + "epoch": 0.87, + "grad_norm": 1.435806155204773, + "learning_rate": 4.4816409370672277e-07, + "loss": 0.5729, + "step": 6777 + }, + { + "epoch": 0.87, + "grad_norm": 1.5084896087646484, + "learning_rate": 4.473058361574534e-07, + "loss": 0.6539, + "step": 6778 + }, + { + "epoch": 0.87, + "grad_norm": 1.4006537199020386, + "learning_rate": 4.4644836271721617e-07, + "loss": 0.6189, + "step": 6779 + }, + { + "epoch": 0.87, + "grad_norm": 1.4999135732650757, + "learning_rate": 4.455916735336946e-07, + "loss": 0.6958, + "step": 6780 + }, + { + "epoch": 0.87, + "grad_norm": 1.1975650787353516, + "learning_rate": 4.4473576875443626e-07, + "loss": 0.6147, + "step": 6781 + }, + { + "epoch": 0.87, + "grad_norm": 1.477052927017212, + "learning_rate": 4.438806485268515e-07, + "loss": 0.5882, + "step": 6782 + }, + { + "epoch": 0.87, + "grad_norm": 1.4066824913024902, + "learning_rate": 4.430263129982182e-07, + "loss": 0.5717, + "step": 6783 + }, + { + "epoch": 0.87, + "grad_norm": 1.330893635749817, + "learning_rate": 4.421727623156796e-07, + "loss": 0.6369, + "step": 6784 + }, + { + "epoch": 0.87, + "grad_norm": 1.5608009099960327, + "learning_rate": 4.41319996626241e-07, + "loss": 0.5731, + "step": 6785 + }, + { + "epoch": 0.87, + "grad_norm": 1.482822299003601, + "learning_rate": 4.404680160767727e-07, + "loss": 0.633, + "step": 6786 + }, + { + "epoch": 0.87, + "grad_norm": 1.300209879875183, + "learning_rate": 4.3961682081401393e-07, + "loss": 0.615, + "step": 6787 + }, + { + "epoch": 0.87, + "grad_norm": 1.7004002332687378, + "learning_rate": 4.3876641098456574e-07, + "loss": 0.5625, + "step": 6788 + }, + { + "epoch": 0.87, + "grad_norm": 2.4036426544189453, + "learning_rate": 4.379167867348916e-07, + "loss": 0.6332, + "step": 6789 + }, + { + "epoch": 0.87, + "grad_norm": 2.266597032546997, + "learning_rate": 4.370679482113227e-07, + "loss": 0.6741, + "step": 6790 + }, + { + "epoch": 0.87, + "grad_norm": 1.4129186868667603, + "learning_rate": 4.362198955600561e-07, + "loss": 0.6334, + "step": 6791 + }, + { + "epoch": 0.87, + "grad_norm": 1.2967772483825684, + "learning_rate": 4.3537262892714926e-07, + "loss": 0.7026, + "step": 6792 + }, + { + "epoch": 0.87, + "grad_norm": 1.3166394233703613, + "learning_rate": 4.345261484585273e-07, + "loss": 0.5694, + "step": 6793 + }, + { + "epoch": 0.87, + "grad_norm": 1.5502707958221436, + "learning_rate": 4.3368045429997953e-07, + "loss": 0.5651, + "step": 6794 + }, + { + "epoch": 0.87, + "grad_norm": 1.5218241214752197, + "learning_rate": 4.3283554659716066e-07, + "loss": 0.6583, + "step": 6795 + }, + { + "epoch": 0.87, + "grad_norm": 1.3967206478118896, + "learning_rate": 4.3199142549558704e-07, + "loss": 0.5456, + "step": 6796 + }, + { + "epoch": 0.87, + "grad_norm": 1.2347934246063232, + "learning_rate": 4.3114809114064183e-07, + "loss": 0.5805, + "step": 6797 + }, + { + "epoch": 0.87, + "grad_norm": 1.6478691101074219, + "learning_rate": 4.303055436775727e-07, + "loss": 0.5988, + "step": 6798 + }, + { + "epoch": 0.87, + "grad_norm": 2.3431971073150635, + "learning_rate": 4.2946378325149196e-07, + "loss": 0.5429, + "step": 6799 + }, + { + "epoch": 0.87, + "grad_norm": 1.5060168504714966, + "learning_rate": 4.286228100073742e-07, + "loss": 0.5957, + "step": 6800 + }, + { + "epoch": 0.87, + "grad_norm": 1.3222039937973022, + "learning_rate": 4.2778262409006064e-07, + "loss": 0.5746, + "step": 6801 + }, + { + "epoch": 0.87, + "grad_norm": 1.2762765884399414, + "learning_rate": 4.2694322564425626e-07, + "loss": 0.5936, + "step": 6802 + }, + { + "epoch": 0.87, + "grad_norm": 1.3855468034744263, + "learning_rate": 4.261046148145315e-07, + "loss": 0.6254, + "step": 6803 + }, + { + "epoch": 0.87, + "grad_norm": 1.342413306236267, + "learning_rate": 4.2526679174531737e-07, + "loss": 0.6038, + "step": 6804 + }, + { + "epoch": 0.87, + "grad_norm": 1.30988609790802, + "learning_rate": 4.244297565809136e-07, + "loss": 0.5995, + "step": 6805 + }, + { + "epoch": 0.87, + "grad_norm": 1.4815443754196167, + "learning_rate": 4.235935094654836e-07, + "loss": 0.6256, + "step": 6806 + }, + { + "epoch": 0.87, + "grad_norm": 1.2580639123916626, + "learning_rate": 4.2275805054305175e-07, + "loss": 0.5472, + "step": 6807 + }, + { + "epoch": 0.87, + "grad_norm": 1.2893187999725342, + "learning_rate": 4.219233799575101e-07, + "loss": 0.6253, + "step": 6808 + }, + { + "epoch": 0.87, + "grad_norm": 1.2105703353881836, + "learning_rate": 4.210894978526131e-07, + "loss": 0.5719, + "step": 6809 + }, + { + "epoch": 0.87, + "grad_norm": 1.4485310316085815, + "learning_rate": 4.202564043719809e-07, + "loss": 0.6083, + "step": 6810 + }, + { + "epoch": 0.87, + "grad_norm": 1.28269624710083, + "learning_rate": 4.194240996590959e-07, + "loss": 0.5364, + "step": 6811 + }, + { + "epoch": 0.87, + "grad_norm": 1.2909283638000488, + "learning_rate": 4.185925838573057e-07, + "loss": 0.6054, + "step": 6812 + }, + { + "epoch": 0.87, + "grad_norm": 1.3592783212661743, + "learning_rate": 4.1776185710982244e-07, + "loss": 0.6188, + "step": 6813 + }, + { + "epoch": 0.87, + "grad_norm": 1.3486402034759521, + "learning_rate": 4.169319195597227e-07, + "loss": 0.6326, + "step": 6814 + }, + { + "epoch": 0.87, + "grad_norm": 1.3511604070663452, + "learning_rate": 4.161027713499444e-07, + "loss": 0.6333, + "step": 6815 + }, + { + "epoch": 0.87, + "grad_norm": 1.4477179050445557, + "learning_rate": 4.152744126232927e-07, + "loss": 0.5499, + "step": 6816 + }, + { + "epoch": 0.87, + "grad_norm": 1.2189664840698242, + "learning_rate": 4.1444684352243525e-07, + "loss": 0.5236, + "step": 6817 + }, + { + "epoch": 0.87, + "grad_norm": 1.2309170961380005, + "learning_rate": 4.136200641899052e-07, + "loss": 0.5513, + "step": 6818 + }, + { + "epoch": 0.87, + "grad_norm": 2.251579761505127, + "learning_rate": 4.12794074768097e-07, + "loss": 0.5862, + "step": 6819 + }, + { + "epoch": 0.87, + "grad_norm": 1.47301185131073, + "learning_rate": 4.119688753992707e-07, + "loss": 0.5965, + "step": 6820 + }, + { + "epoch": 0.87, + "grad_norm": 1.6270514726638794, + "learning_rate": 4.111444662255498e-07, + "loss": 0.5907, + "step": 6821 + }, + { + "epoch": 0.87, + "grad_norm": 1.4491541385650635, + "learning_rate": 4.103208473889231e-07, + "loss": 0.519, + "step": 6822 + }, + { + "epoch": 0.87, + "grad_norm": 1.2944505214691162, + "learning_rate": 4.09498019031242e-07, + "loss": 0.6209, + "step": 6823 + }, + { + "epoch": 0.87, + "grad_norm": 1.4557186365127563, + "learning_rate": 4.0867598129422146e-07, + "loss": 0.6311, + "step": 6824 + }, + { + "epoch": 0.87, + "grad_norm": 1.2103979587554932, + "learning_rate": 4.0785473431944165e-07, + "loss": 0.5814, + "step": 6825 + }, + { + "epoch": 0.87, + "grad_norm": 1.428792953491211, + "learning_rate": 4.070342782483444e-07, + "loss": 0.5717, + "step": 6826 + }, + { + "epoch": 0.87, + "grad_norm": 1.3763489723205566, + "learning_rate": 4.062146132222372e-07, + "loss": 0.551, + "step": 6827 + }, + { + "epoch": 0.87, + "grad_norm": 1.3854894638061523, + "learning_rate": 4.053957393822905e-07, + "loss": 0.5466, + "step": 6828 + }, + { + "epoch": 0.87, + "grad_norm": 1.4803504943847656, + "learning_rate": 4.045776568695398e-07, + "loss": 0.6583, + "step": 6829 + }, + { + "epoch": 0.88, + "grad_norm": 1.1337047815322876, + "learning_rate": 4.037603658248812e-07, + "loss": 0.6015, + "step": 6830 + }, + { + "epoch": 0.88, + "grad_norm": 1.2642403841018677, + "learning_rate": 4.029438663890778e-07, + "loss": 0.6461, + "step": 6831 + }, + { + "epoch": 0.88, + "grad_norm": 1.5304993391036987, + "learning_rate": 4.021281587027548e-07, + "loss": 0.509, + "step": 6832 + }, + { + "epoch": 0.88, + "grad_norm": 1.3939540386199951, + "learning_rate": 4.0131324290640206e-07, + "loss": 0.5702, + "step": 6833 + }, + { + "epoch": 0.88, + "grad_norm": 1.4129104614257812, + "learning_rate": 4.0049911914037067e-07, + "loss": 0.6004, + "step": 6834 + }, + { + "epoch": 0.88, + "grad_norm": 1.4573888778686523, + "learning_rate": 3.9968578754487784e-07, + "loss": 0.6195, + "step": 6835 + }, + { + "epoch": 0.88, + "grad_norm": 1.4409502744674683, + "learning_rate": 3.988732482600033e-07, + "loss": 0.556, + "step": 6836 + }, + { + "epoch": 0.88, + "grad_norm": 1.4219567775726318, + "learning_rate": 3.9806150142569e-07, + "loss": 0.6177, + "step": 6837 + }, + { + "epoch": 0.88, + "grad_norm": 1.3022994995117188, + "learning_rate": 3.972505471817445e-07, + "loss": 0.5414, + "step": 6838 + }, + { + "epoch": 0.88, + "grad_norm": 1.4800045490264893, + "learning_rate": 3.9644038566783946e-07, + "loss": 0.6664, + "step": 6839 + }, + { + "epoch": 0.88, + "grad_norm": 1.3413270711898804, + "learning_rate": 3.9563101702350616e-07, + "loss": 0.5806, + "step": 6840 + }, + { + "epoch": 0.88, + "grad_norm": 1.4288663864135742, + "learning_rate": 3.9482244138814295e-07, + "loss": 0.5744, + "step": 6841 + }, + { + "epoch": 0.88, + "grad_norm": 1.5452122688293457, + "learning_rate": 3.940146589010108e-07, + "loss": 0.5657, + "step": 6842 + }, + { + "epoch": 0.88, + "grad_norm": 1.9402995109558105, + "learning_rate": 3.9320766970123383e-07, + "loss": 0.5839, + "step": 6843 + }, + { + "epoch": 0.88, + "grad_norm": 1.1517301797866821, + "learning_rate": 3.924014739277987e-07, + "loss": 0.5033, + "step": 6844 + }, + { + "epoch": 0.88, + "grad_norm": 1.396001935005188, + "learning_rate": 3.915960717195566e-07, + "loss": 0.6478, + "step": 6845 + }, + { + "epoch": 0.88, + "grad_norm": 1.4981609582901, + "learning_rate": 3.907914632152215e-07, + "loss": 0.602, + "step": 6846 + }, + { + "epoch": 0.88, + "grad_norm": 1.3975633382797241, + "learning_rate": 3.8998764855337266e-07, + "loss": 0.5397, + "step": 6847 + }, + { + "epoch": 0.88, + "grad_norm": 1.1856523752212524, + "learning_rate": 3.8918462787244817e-07, + "loss": 0.5272, + "step": 6848 + }, + { + "epoch": 0.88, + "grad_norm": 1.2696727514266968, + "learning_rate": 3.8838240131075343e-07, + "loss": 0.5655, + "step": 6849 + }, + { + "epoch": 0.88, + "grad_norm": 1.603298306465149, + "learning_rate": 3.8758096900645524e-07, + "loss": 0.522, + "step": 6850 + }, + { + "epoch": 0.88, + "grad_norm": 1.3075599670410156, + "learning_rate": 3.867803310975854e-07, + "loss": 0.5532, + "step": 6851 + }, + { + "epoch": 0.88, + "grad_norm": 1.4688489437103271, + "learning_rate": 3.859804877220352e-07, + "loss": 0.5474, + "step": 6852 + }, + { + "epoch": 0.88, + "grad_norm": 1.316911220550537, + "learning_rate": 3.851814390175623e-07, + "loss": 0.6715, + "step": 6853 + }, + { + "epoch": 0.88, + "grad_norm": 1.143972635269165, + "learning_rate": 3.843831851217872e-07, + "loss": 0.7013, + "step": 6854 + }, + { + "epoch": 0.88, + "grad_norm": 1.1683162450790405, + "learning_rate": 3.835857261721926e-07, + "loss": 0.5927, + "step": 6855 + }, + { + "epoch": 0.88, + "grad_norm": 1.6495614051818848, + "learning_rate": 3.827890623061242e-07, + "loss": 0.6061, + "step": 6856 + }, + { + "epoch": 0.88, + "grad_norm": 1.4121828079223633, + "learning_rate": 3.8199319366079177e-07, + "loss": 0.5579, + "step": 6857 + }, + { + "epoch": 0.88, + "grad_norm": 1.4523359537124634, + "learning_rate": 3.811981203732684e-07, + "loss": 0.5648, + "step": 6858 + }, + { + "epoch": 0.88, + "grad_norm": 1.28249990940094, + "learning_rate": 3.8040384258048677e-07, + "loss": 0.6971, + "step": 6859 + }, + { + "epoch": 0.88, + "grad_norm": 1.5575233697891235, + "learning_rate": 3.7961036041924635e-07, + "loss": 0.5897, + "step": 6860 + }, + { + "epoch": 0.88, + "grad_norm": 1.302139163017273, + "learning_rate": 3.788176740262089e-07, + "loss": 0.6636, + "step": 6861 + }, + { + "epoch": 0.88, + "grad_norm": 1.1842310428619385, + "learning_rate": 3.7802578353789864e-07, + "loss": 0.5368, + "step": 6862 + }, + { + "epoch": 0.88, + "grad_norm": 1.4447256326675415, + "learning_rate": 3.7723468909070136e-07, + "loss": 0.5266, + "step": 6863 + }, + { + "epoch": 0.88, + "grad_norm": 1.4478634595870972, + "learning_rate": 3.764443908208676e-07, + "loss": 0.5914, + "step": 6864 + }, + { + "epoch": 0.88, + "grad_norm": 1.580125093460083, + "learning_rate": 3.7565488886451004e-07, + "loss": 0.5411, + "step": 6865 + }, + { + "epoch": 0.88, + "grad_norm": 1.5011149644851685, + "learning_rate": 3.748661833576056e-07, + "loss": 0.6147, + "step": 6866 + }, + { + "epoch": 0.88, + "grad_norm": 1.561859369277954, + "learning_rate": 3.740782744359911e-07, + "loss": 0.6672, + "step": 6867 + }, + { + "epoch": 0.88, + "grad_norm": 1.3884916305541992, + "learning_rate": 3.7329116223536797e-07, + "loss": 0.6525, + "step": 6868 + }, + { + "epoch": 0.88, + "grad_norm": 1.4261103868484497, + "learning_rate": 3.7250484689130115e-07, + "loss": 0.5723, + "step": 6869 + }, + { + "epoch": 0.88, + "grad_norm": 1.4831106662750244, + "learning_rate": 3.717193285392179e-07, + "loss": 0.5859, + "step": 6870 + }, + { + "epoch": 0.88, + "grad_norm": 1.6929248571395874, + "learning_rate": 3.70934607314406e-07, + "loss": 0.6201, + "step": 6871 + }, + { + "epoch": 0.88, + "grad_norm": 1.2150589227676392, + "learning_rate": 3.70150683352018e-07, + "loss": 0.608, + "step": 6872 + }, + { + "epoch": 0.88, + "grad_norm": 1.2132083177566528, + "learning_rate": 3.693675567870714e-07, + "loss": 0.5384, + "step": 6873 + }, + { + "epoch": 0.88, + "grad_norm": 1.33145010471344, + "learning_rate": 3.685852277544405e-07, + "loss": 0.6047, + "step": 6874 + }, + { + "epoch": 0.88, + "grad_norm": 1.2606712579727173, + "learning_rate": 3.678036963888676e-07, + "loss": 0.6038, + "step": 6875 + }, + { + "epoch": 0.88, + "grad_norm": 1.2835811376571655, + "learning_rate": 3.670229628249555e-07, + "loss": 0.5235, + "step": 6876 + }, + { + "epoch": 0.88, + "grad_norm": 1.5742946863174438, + "learning_rate": 3.662430271971695e-07, + "loss": 0.654, + "step": 6877 + }, + { + "epoch": 0.88, + "grad_norm": 1.2673202753067017, + "learning_rate": 3.6546388963983716e-07, + "loss": 0.529, + "step": 6878 + }, + { + "epoch": 0.88, + "grad_norm": 1.531229019165039, + "learning_rate": 3.646855502871488e-07, + "loss": 0.5374, + "step": 6879 + }, + { + "epoch": 0.88, + "grad_norm": 1.3460601568222046, + "learning_rate": 3.639080092731584e-07, + "loss": 0.5097, + "step": 6880 + }, + { + "epoch": 0.88, + "grad_norm": 1.3324980735778809, + "learning_rate": 3.6313126673178213e-07, + "loss": 0.547, + "step": 6881 + }, + { + "epoch": 0.88, + "grad_norm": 1.4404997825622559, + "learning_rate": 3.623553227967963e-07, + "loss": 0.5873, + "step": 6882 + }, + { + "epoch": 0.88, + "grad_norm": 1.44928777217865, + "learning_rate": 3.6158017760184237e-07, + "loss": 0.5764, + "step": 6883 + }, + { + "epoch": 0.88, + "grad_norm": 1.1881428956985474, + "learning_rate": 3.608058312804247e-07, + "loss": 0.6388, + "step": 6884 + }, + { + "epoch": 0.88, + "grad_norm": 1.5132038593292236, + "learning_rate": 3.600322839659065e-07, + "loss": 0.5741, + "step": 6885 + }, + { + "epoch": 0.88, + "grad_norm": 2.413456439971924, + "learning_rate": 3.592595357915163e-07, + "loss": 0.6382, + "step": 6886 + }, + { + "epoch": 0.88, + "grad_norm": 1.2886861562728882, + "learning_rate": 3.584875868903448e-07, + "loss": 0.5972, + "step": 6887 + }, + { + "epoch": 0.88, + "grad_norm": 1.2081990242004395, + "learning_rate": 3.577164373953446e-07, + "loss": 0.5572, + "step": 6888 + }, + { + "epoch": 0.88, + "grad_norm": 1.4857827425003052, + "learning_rate": 3.569460874393288e-07, + "loss": 0.5606, + "step": 6889 + }, + { + "epoch": 0.88, + "grad_norm": 1.3529541492462158, + "learning_rate": 3.561765371549769e-07, + "loss": 0.5588, + "step": 6890 + }, + { + "epoch": 0.88, + "grad_norm": 1.1445674896240234, + "learning_rate": 3.554077866748279e-07, + "loss": 0.5352, + "step": 6891 + }, + { + "epoch": 0.88, + "grad_norm": 1.4179437160491943, + "learning_rate": 3.5463983613128136e-07, + "loss": 0.5433, + "step": 6892 + }, + { + "epoch": 0.88, + "grad_norm": 1.1696901321411133, + "learning_rate": 3.5387268565660324e-07, + "loss": 0.5855, + "step": 6893 + }, + { + "epoch": 0.88, + "grad_norm": 1.259205937385559, + "learning_rate": 3.5310633538291894e-07, + "loss": 0.5802, + "step": 6894 + }, + { + "epoch": 0.88, + "grad_norm": 1.2908961772918701, + "learning_rate": 3.523407854422173e-07, + "loss": 0.6712, + "step": 6895 + }, + { + "epoch": 0.88, + "grad_norm": 1.3714510202407837, + "learning_rate": 3.5157603596634727e-07, + "loss": 0.6665, + "step": 6896 + }, + { + "epoch": 0.88, + "grad_norm": 1.2138036489486694, + "learning_rate": 3.508120870870224e-07, + "loss": 0.6244, + "step": 6897 + }, + { + "epoch": 0.88, + "grad_norm": 1.4784094095230103, + "learning_rate": 3.5004893893581746e-07, + "loss": 0.6091, + "step": 6898 + }, + { + "epoch": 0.88, + "grad_norm": 1.4526443481445312, + "learning_rate": 3.4928659164416956e-07, + "loss": 0.6336, + "step": 6899 + }, + { + "epoch": 0.88, + "grad_norm": 1.4262062311172485, + "learning_rate": 3.485250453433764e-07, + "loss": 0.5957, + "step": 6900 + }, + { + "epoch": 0.88, + "grad_norm": 1.7060856819152832, + "learning_rate": 3.4776430016459917e-07, + "loss": 0.5559, + "step": 6901 + }, + { + "epoch": 0.88, + "grad_norm": 1.9785739183425903, + "learning_rate": 3.4700435623886143e-07, + "loss": 0.6409, + "step": 6902 + }, + { + "epoch": 0.88, + "grad_norm": 1.693240761756897, + "learning_rate": 3.462452136970479e-07, + "loss": 0.6472, + "step": 6903 + }, + { + "epoch": 0.88, + "grad_norm": 1.6329549551010132, + "learning_rate": 3.4548687266990453e-07, + "loss": 0.5792, + "step": 6904 + }, + { + "epoch": 0.88, + "grad_norm": 1.6811611652374268, + "learning_rate": 3.4472933328804134e-07, + "loss": 0.5417, + "step": 6905 + }, + { + "epoch": 0.88, + "grad_norm": 1.2126712799072266, + "learning_rate": 3.439725956819284e-07, + "loss": 0.5035, + "step": 6906 + }, + { + "epoch": 0.88, + "grad_norm": 1.4418469667434692, + "learning_rate": 3.4321665998189814e-07, + "loss": 0.5954, + "step": 6907 + }, + { + "epoch": 0.89, + "grad_norm": 1.128082275390625, + "learning_rate": 3.424615263181458e-07, + "loss": 0.5779, + "step": 6908 + }, + { + "epoch": 0.89, + "grad_norm": 1.4310845136642456, + "learning_rate": 3.417071948207273e-07, + "loss": 0.505, + "step": 6909 + }, + { + "epoch": 0.89, + "grad_norm": 1.5503251552581787, + "learning_rate": 3.409536656195628e-07, + "loss": 0.6519, + "step": 6910 + }, + { + "epoch": 0.89, + "grad_norm": 1.8439539670944214, + "learning_rate": 3.4020093884442885e-07, + "loss": 0.5995, + "step": 6911 + }, + { + "epoch": 0.89, + "grad_norm": 1.237624168395996, + "learning_rate": 3.3944901462497014e-07, + "loss": 0.4917, + "step": 6912 + }, + { + "epoch": 0.89, + "grad_norm": 1.3327163457870483, + "learning_rate": 3.3869789309068866e-07, + "loss": 0.5526, + "step": 6913 + }, + { + "epoch": 0.89, + "grad_norm": 1.702526330947876, + "learning_rate": 3.3794757437095206e-07, + "loss": 0.6228, + "step": 6914 + }, + { + "epoch": 0.89, + "grad_norm": 1.312321424484253, + "learning_rate": 3.3719805859498466e-07, + "loss": 0.6137, + "step": 6915 + }, + { + "epoch": 0.89, + "grad_norm": 1.5142812728881836, + "learning_rate": 3.364493458918772e-07, + "loss": 0.5638, + "step": 6916 + }, + { + "epoch": 0.89, + "grad_norm": 1.370383858680725, + "learning_rate": 3.3570143639057916e-07, + "loss": 0.5413, + "step": 6917 + }, + { + "epoch": 0.89, + "grad_norm": 1.4188882112503052, + "learning_rate": 3.349543302199043e-07, + "loss": 0.5455, + "step": 6918 + }, + { + "epoch": 0.89, + "grad_norm": 1.3100755214691162, + "learning_rate": 3.342080275085252e-07, + "loss": 0.6094, + "step": 6919 + }, + { + "epoch": 0.89, + "grad_norm": 1.6879796981811523, + "learning_rate": 3.3346252838497683e-07, + "loss": 0.6627, + "step": 6920 + }, + { + "epoch": 0.89, + "grad_norm": 1.2003105878829956, + "learning_rate": 3.327178329776576e-07, + "loss": 0.6109, + "step": 6921 + }, + { + "epoch": 0.89, + "grad_norm": 1.4838978052139282, + "learning_rate": 3.31973941414826e-07, + "loss": 0.5857, + "step": 6922 + }, + { + "epoch": 0.89, + "grad_norm": 1.5608856678009033, + "learning_rate": 3.312308538246006e-07, + "loss": 0.6024, + "step": 6923 + }, + { + "epoch": 0.89, + "grad_norm": 2.857037305831909, + "learning_rate": 3.3048857033496473e-07, + "loss": 0.5997, + "step": 6924 + }, + { + "epoch": 0.89, + "grad_norm": 1.4345684051513672, + "learning_rate": 3.2974709107376215e-07, + "loss": 0.6384, + "step": 6925 + }, + { + "epoch": 0.89, + "grad_norm": 1.6866978406906128, + "learning_rate": 3.290064161686962e-07, + "loss": 0.5539, + "step": 6926 + }, + { + "epoch": 0.89, + "grad_norm": 1.3979156017303467, + "learning_rate": 3.282665457473333e-07, + "loss": 0.6002, + "step": 6927 + }, + { + "epoch": 0.89, + "grad_norm": 1.108978033065796, + "learning_rate": 3.2752747993710144e-07, + "loss": 0.7341, + "step": 6928 + }, + { + "epoch": 0.89, + "grad_norm": 1.3866137266159058, + "learning_rate": 3.267892188652905e-07, + "loss": 0.6452, + "step": 6929 + }, + { + "epoch": 0.89, + "grad_norm": 1.1434195041656494, + "learning_rate": 3.2605176265904925e-07, + "loss": 0.6161, + "step": 6930 + }, + { + "epoch": 0.89, + "grad_norm": 1.0713568925857544, + "learning_rate": 3.253151114453901e-07, + "loss": 0.6877, + "step": 6931 + }, + { + "epoch": 0.89, + "grad_norm": 1.3612641096115112, + "learning_rate": 3.245792653511876e-07, + "loss": 0.5224, + "step": 6932 + }, + { + "epoch": 0.89, + "grad_norm": 1.527736783027649, + "learning_rate": 3.238442245031742e-07, + "loss": 0.6014, + "step": 6933 + }, + { + "epoch": 0.89, + "grad_norm": 1.1294968128204346, + "learning_rate": 3.2310998902794653e-07, + "loss": 0.7599, + "step": 6934 + }, + { + "epoch": 0.89, + "grad_norm": 1.4158496856689453, + "learning_rate": 3.223765590519623e-07, + "loss": 0.5286, + "step": 6935 + }, + { + "epoch": 0.89, + "grad_norm": 1.530761957168579, + "learning_rate": 3.216439347015399e-07, + "loss": 0.5673, + "step": 6936 + }, + { + "epoch": 0.89, + "grad_norm": 1.185805320739746, + "learning_rate": 3.209121161028583e-07, + "loss": 0.5242, + "step": 6937 + }, + { + "epoch": 0.89, + "grad_norm": 1.3359084129333496, + "learning_rate": 3.201811033819585e-07, + "loss": 0.5777, + "step": 6938 + }, + { + "epoch": 0.89, + "grad_norm": 2.525627613067627, + "learning_rate": 3.194508966647425e-07, + "loss": 0.5719, + "step": 6939 + }, + { + "epoch": 0.89, + "grad_norm": 1.422498345375061, + "learning_rate": 3.1872149607697466e-07, + "loss": 0.5689, + "step": 6940 + }, + { + "epoch": 0.89, + "grad_norm": 1.4468578100204468, + "learning_rate": 3.179929017442773e-07, + "loss": 0.5871, + "step": 6941 + }, + { + "epoch": 0.89, + "grad_norm": 1.309499979019165, + "learning_rate": 3.1726511379213784e-07, + "loss": 0.6147, + "step": 6942 + }, + { + "epoch": 0.89, + "grad_norm": 2.29915189743042, + "learning_rate": 3.1653813234590327e-07, + "loss": 0.6435, + "step": 6943 + }, + { + "epoch": 0.89, + "grad_norm": 1.3363467454910278, + "learning_rate": 3.1581195753078e-07, + "loss": 0.5975, + "step": 6944 + }, + { + "epoch": 0.89, + "grad_norm": 1.5139838457107544, + "learning_rate": 3.150865894718369e-07, + "loss": 0.6698, + "step": 6945 + }, + { + "epoch": 0.89, + "grad_norm": 1.194031834602356, + "learning_rate": 3.143620282940046e-07, + "loss": 0.5815, + "step": 6946 + }, + { + "epoch": 0.89, + "grad_norm": 1.3560861349105835, + "learning_rate": 3.136382741220745e-07, + "loss": 0.4936, + "step": 6947 + }, + { + "epoch": 0.89, + "grad_norm": 1.2448467016220093, + "learning_rate": 3.1291532708069727e-07, + "loss": 0.575, + "step": 6948 + }, + { + "epoch": 0.89, + "grad_norm": 1.4894511699676514, + "learning_rate": 3.121931872943862e-07, + "loss": 0.6252, + "step": 6949 + }, + { + "epoch": 0.89, + "grad_norm": 1.2867622375488281, + "learning_rate": 3.1147185488751506e-07, + "loss": 0.641, + "step": 6950 + }, + { + "epoch": 0.89, + "grad_norm": 1.371701955795288, + "learning_rate": 3.107513299843201e-07, + "loss": 0.6279, + "step": 6951 + }, + { + "epoch": 0.89, + "grad_norm": 1.264610767364502, + "learning_rate": 3.100316127088954e-07, + "loss": 0.6091, + "step": 6952 + }, + { + "epoch": 0.89, + "grad_norm": 1.2578670978546143, + "learning_rate": 3.093127031851978e-07, + "loss": 0.5467, + "step": 6953 + }, + { + "epoch": 0.89, + "grad_norm": 1.3644834756851196, + "learning_rate": 3.0859460153704557e-07, + "loss": 0.5687, + "step": 6954 + }, + { + "epoch": 0.89, + "grad_norm": 1.1062878370285034, + "learning_rate": 3.0787730788811807e-07, + "loss": 0.5197, + "step": 6955 + }, + { + "epoch": 0.89, + "grad_norm": 1.430008888244629, + "learning_rate": 3.0716082236195213e-07, + "loss": 0.5242, + "step": 6956 + }, + { + "epoch": 0.89, + "grad_norm": 1.4670532941818237, + "learning_rate": 3.064451450819489e-07, + "loss": 0.5455, + "step": 6957 + }, + { + "epoch": 0.89, + "grad_norm": 1.1277669668197632, + "learning_rate": 3.057302761713693e-07, + "loss": 0.7229, + "step": 6958 + }, + { + "epoch": 0.89, + "grad_norm": 1.744103193283081, + "learning_rate": 3.050162157533354e-07, + "loss": 0.5868, + "step": 6959 + }, + { + "epoch": 0.89, + "grad_norm": 1.5596626996994019, + "learning_rate": 3.0430296395082883e-07, + "loss": 0.643, + "step": 6960 + }, + { + "epoch": 0.89, + "grad_norm": 1.3399211168289185, + "learning_rate": 3.035905208866935e-07, + "loss": 0.6114, + "step": 6961 + }, + { + "epoch": 0.89, + "grad_norm": 2.7940516471862793, + "learning_rate": 3.028788866836335e-07, + "loss": 0.6472, + "step": 6962 + }, + { + "epoch": 0.89, + "grad_norm": 1.4219404458999634, + "learning_rate": 3.02168061464212e-07, + "loss": 0.5366, + "step": 6963 + }, + { + "epoch": 0.89, + "grad_norm": 1.567854881286621, + "learning_rate": 3.0145804535085476e-07, + "loss": 0.5771, + "step": 6964 + }, + { + "epoch": 0.89, + "grad_norm": 1.4092823266983032, + "learning_rate": 3.00748838465848e-07, + "loss": 0.5056, + "step": 6965 + }, + { + "epoch": 0.89, + "grad_norm": 1.33809494972229, + "learning_rate": 3.0004044093133844e-07, + "loss": 0.5536, + "step": 6966 + }, + { + "epoch": 0.89, + "grad_norm": 1.3685873746871948, + "learning_rate": 2.9933285286933177e-07, + "loss": 0.6435, + "step": 6967 + }, + { + "epoch": 0.89, + "grad_norm": 1.6785255670547485, + "learning_rate": 2.9862607440169724e-07, + "loss": 0.5749, + "step": 6968 + }, + { + "epoch": 0.89, + "grad_norm": 1.203216314315796, + "learning_rate": 2.979201056501618e-07, + "loss": 0.5402, + "step": 6969 + }, + { + "epoch": 0.89, + "grad_norm": 1.2036579847335815, + "learning_rate": 2.972149467363161e-07, + "loss": 0.5646, + "step": 6970 + }, + { + "epoch": 0.89, + "grad_norm": 1.28584623336792, + "learning_rate": 2.9651059778160685e-07, + "loss": 0.6238, + "step": 6971 + }, + { + "epoch": 0.89, + "grad_norm": 1.2839908599853516, + "learning_rate": 2.958070589073453e-07, + "loss": 0.6049, + "step": 6972 + }, + { + "epoch": 0.89, + "grad_norm": 1.457728624343872, + "learning_rate": 2.9510433023470174e-07, + "loss": 0.5392, + "step": 6973 + }, + { + "epoch": 0.89, + "grad_norm": 1.3485701084136963, + "learning_rate": 2.9440241188470717e-07, + "loss": 0.6339, + "step": 6974 + }, + { + "epoch": 0.89, + "grad_norm": 1.3882330656051636, + "learning_rate": 2.93701303978251e-07, + "loss": 0.561, + "step": 6975 + }, + { + "epoch": 0.89, + "grad_norm": 1.0906554460525513, + "learning_rate": 2.930010066360872e-07, + "loss": 0.5579, + "step": 6976 + }, + { + "epoch": 0.89, + "grad_norm": 1.292925238609314, + "learning_rate": 2.9230151997882703e-07, + "loss": 0.5328, + "step": 6977 + }, + { + "epoch": 0.89, + "grad_norm": 1.570992350578308, + "learning_rate": 2.9160284412694195e-07, + "loss": 0.5696, + "step": 6978 + }, + { + "epoch": 0.89, + "grad_norm": 1.475880742073059, + "learning_rate": 2.909049792007651e-07, + "loss": 0.6416, + "step": 6979 + }, + { + "epoch": 0.89, + "grad_norm": 1.4461641311645508, + "learning_rate": 2.902079253204898e-07, + "loss": 0.577, + "step": 6980 + }, + { + "epoch": 0.89, + "grad_norm": 1.4062169790267944, + "learning_rate": 2.8951168260617004e-07, + "loss": 0.5309, + "step": 6981 + }, + { + "epoch": 0.89, + "grad_norm": 1.2815173864364624, + "learning_rate": 2.888162511777176e-07, + "loss": 0.6191, + "step": 6982 + }, + { + "epoch": 0.89, + "grad_norm": 1.3789962530136108, + "learning_rate": 2.881216311549079e-07, + "loss": 0.5828, + "step": 6983 + }, + { + "epoch": 0.89, + "grad_norm": 1.1321500539779663, + "learning_rate": 2.8742782265737514e-07, + "loss": 0.7302, + "step": 6984 + }, + { + "epoch": 0.89, + "grad_norm": 1.0742570161819458, + "learning_rate": 2.8673482580461264e-07, + "loss": 0.5523, + "step": 6985 + }, + { + "epoch": 0.9, + "grad_norm": 1.1808911561965942, + "learning_rate": 2.8604264071597607e-07, + "loss": 0.5198, + "step": 6986 + }, + { + "epoch": 0.9, + "grad_norm": 1.1899702548980713, + "learning_rate": 2.8535126751067954e-07, + "loss": 0.7027, + "step": 6987 + }, + { + "epoch": 0.9, + "grad_norm": 1.4160841703414917, + "learning_rate": 2.846607063077994e-07, + "loss": 0.624, + "step": 6988 + }, + { + "epoch": 0.9, + "grad_norm": 1.3612381219863892, + "learning_rate": 2.8397095722626833e-07, + "loss": 0.6166, + "step": 6989 + }, + { + "epoch": 0.9, + "grad_norm": 1.779995083808899, + "learning_rate": 2.832820203848835e-07, + "loss": 0.6595, + "step": 6990 + }, + { + "epoch": 0.9, + "grad_norm": 1.5072723627090454, + "learning_rate": 2.8259389590230003e-07, + "loss": 0.6803, + "step": 6991 + }, + { + "epoch": 0.9, + "grad_norm": 1.4251368045806885, + "learning_rate": 2.8190658389703304e-07, + "loss": 0.5835, + "step": 6992 + }, + { + "epoch": 0.9, + "grad_norm": 1.4748808145523071, + "learning_rate": 2.8122008448745795e-07, + "loss": 0.5532, + "step": 6993 + }, + { + "epoch": 0.9, + "grad_norm": 1.4076902866363525, + "learning_rate": 2.805343977918101e-07, + "loss": 0.5721, + "step": 6994 + }, + { + "epoch": 0.9, + "grad_norm": 1.2788536548614502, + "learning_rate": 2.798495239281868e-07, + "loss": 0.5865, + "step": 6995 + }, + { + "epoch": 0.9, + "grad_norm": 1.204859972000122, + "learning_rate": 2.7916546301454185e-07, + "loss": 0.5728, + "step": 6996 + }, + { + "epoch": 0.9, + "grad_norm": 1.7112232446670532, + "learning_rate": 2.7848221516869114e-07, + "loss": 0.6384, + "step": 6997 + }, + { + "epoch": 0.9, + "grad_norm": 1.2741093635559082, + "learning_rate": 2.7779978050830993e-07, + "loss": 0.61, + "step": 6998 + }, + { + "epoch": 0.9, + "grad_norm": 1.2119300365447998, + "learning_rate": 2.771181591509353e-07, + "loss": 0.6146, + "step": 6999 + }, + { + "epoch": 0.9, + "grad_norm": 1.110952615737915, + "learning_rate": 2.764373512139612e-07, + "loss": 0.6863, + "step": 7000 + }, + { + "epoch": 0.9, + "grad_norm": 1.2552884817123413, + "learning_rate": 2.757573568146432e-07, + "loss": 0.5632, + "step": 7001 + }, + { + "epoch": 0.9, + "grad_norm": 1.9371869564056396, + "learning_rate": 2.750781760700966e-07, + "loss": 0.6369, + "step": 7002 + }, + { + "epoch": 0.9, + "grad_norm": 3.263620138168335, + "learning_rate": 2.7439980909729716e-07, + "loss": 0.6031, + "step": 7003 + }, + { + "epoch": 0.9, + "grad_norm": 1.5019482374191284, + "learning_rate": 2.737222560130787e-07, + "loss": 0.5737, + "step": 7004 + }, + { + "epoch": 0.9, + "grad_norm": 1.525652289390564, + "learning_rate": 2.7304551693413616e-07, + "loss": 0.6343, + "step": 7005 + }, + { + "epoch": 0.9, + "grad_norm": 1.3184609413146973, + "learning_rate": 2.723695919770242e-07, + "loss": 0.6735, + "step": 7006 + }, + { + "epoch": 0.9, + "grad_norm": 1.2091474533081055, + "learning_rate": 2.7169448125815846e-07, + "loss": 0.5457, + "step": 7007 + }, + { + "epoch": 0.9, + "grad_norm": 1.3816118240356445, + "learning_rate": 2.7102018489381154e-07, + "loss": 0.6264, + "step": 7008 + }, + { + "epoch": 0.9, + "grad_norm": 1.2462316751480103, + "learning_rate": 2.7034670300011614e-07, + "loss": 0.6312, + "step": 7009 + }, + { + "epoch": 0.9, + "grad_norm": 1.240803837776184, + "learning_rate": 2.696740356930694e-07, + "loss": 0.5673, + "step": 7010 + }, + { + "epoch": 0.9, + "grad_norm": 1.3565901517868042, + "learning_rate": 2.690021830885214e-07, + "loss": 0.6344, + "step": 7011 + }, + { + "epoch": 0.9, + "grad_norm": 1.1310712099075317, + "learning_rate": 2.6833114530218694e-07, + "loss": 0.5777, + "step": 7012 + }, + { + "epoch": 0.9, + "grad_norm": 1.0522847175598145, + "learning_rate": 2.676609224496374e-07, + "loss": 0.6879, + "step": 7013 + }, + { + "epoch": 0.9, + "grad_norm": 1.6499149799346924, + "learning_rate": 2.6699151464630644e-07, + "loss": 0.6255, + "step": 7014 + }, + { + "epoch": 0.9, + "grad_norm": 1.2910633087158203, + "learning_rate": 2.663229220074842e-07, + "loss": 0.6169, + "step": 7015 + }, + { + "epoch": 0.9, + "grad_norm": 1.4426332712173462, + "learning_rate": 2.6565514464832354e-07, + "loss": 0.6411, + "step": 7016 + }, + { + "epoch": 0.9, + "grad_norm": 1.2526745796203613, + "learning_rate": 2.6498818268383465e-07, + "loss": 0.574, + "step": 7017 + }, + { + "epoch": 0.9, + "grad_norm": 1.5019805431365967, + "learning_rate": 2.6432203622888895e-07, + "loss": 0.5775, + "step": 7018 + }, + { + "epoch": 0.9, + "grad_norm": 1.2330554723739624, + "learning_rate": 2.636567053982164e-07, + "loss": 0.4841, + "step": 7019 + }, + { + "epoch": 0.9, + "grad_norm": 1.3986729383468628, + "learning_rate": 2.6299219030640586e-07, + "loss": 0.5777, + "step": 7020 + }, + { + "epoch": 0.9, + "grad_norm": 1.4264737367630005, + "learning_rate": 2.6232849106790745e-07, + "loss": 0.6921, + "step": 7021 + }, + { + "epoch": 0.9, + "grad_norm": 1.2543922662734985, + "learning_rate": 2.616656077970309e-07, + "loss": 0.5413, + "step": 7022 + }, + { + "epoch": 0.9, + "grad_norm": 1.8443100452423096, + "learning_rate": 2.6100354060794196e-07, + "loss": 0.6575, + "step": 7023 + }, + { + "epoch": 0.9, + "grad_norm": 2.33084774017334, + "learning_rate": 2.603422896146696e-07, + "loss": 0.5558, + "step": 7024 + }, + { + "epoch": 0.9, + "grad_norm": 1.9156298637390137, + "learning_rate": 2.5968185493110034e-07, + "loss": 0.581, + "step": 7025 + }, + { + "epoch": 0.9, + "grad_norm": 1.377859354019165, + "learning_rate": 2.590222366709816e-07, + "loss": 0.6449, + "step": 7026 + }, + { + "epoch": 0.9, + "grad_norm": 1.6291786432266235, + "learning_rate": 2.58363434947918e-07, + "loss": 0.5664, + "step": 7027 + }, + { + "epoch": 0.9, + "grad_norm": 1.342464566230774, + "learning_rate": 2.5770544987537616e-07, + "loss": 0.6111, + "step": 7028 + }, + { + "epoch": 0.9, + "grad_norm": 1.6001750230789185, + "learning_rate": 2.570482815666797e-07, + "loss": 0.5381, + "step": 7029 + }, + { + "epoch": 0.9, + "grad_norm": 1.5080769062042236, + "learning_rate": 2.563919301350126e-07, + "loss": 0.6536, + "step": 7030 + }, + { + "epoch": 0.9, + "grad_norm": 1.3722752332687378, + "learning_rate": 2.557363956934178e-07, + "loss": 0.543, + "step": 7031 + }, + { + "epoch": 0.9, + "grad_norm": 1.3873956203460693, + "learning_rate": 2.550816783547988e-07, + "loss": 0.521, + "step": 7032 + }, + { + "epoch": 0.9, + "grad_norm": 1.437137246131897, + "learning_rate": 2.54427778231916e-07, + "loss": 0.5755, + "step": 7033 + }, + { + "epoch": 0.9, + "grad_norm": 1.7044881582260132, + "learning_rate": 2.53774695437391e-07, + "loss": 0.6265, + "step": 7034 + }, + { + "epoch": 0.9, + "grad_norm": 1.2608447074890137, + "learning_rate": 2.531224300837043e-07, + "loss": 0.5938, + "step": 7035 + }, + { + "epoch": 0.9, + "grad_norm": 1.528001308441162, + "learning_rate": 2.5247098228319557e-07, + "loss": 0.5902, + "step": 7036 + }, + { + "epoch": 0.9, + "grad_norm": 1.4568138122558594, + "learning_rate": 2.518203521480622e-07, + "loss": 0.5409, + "step": 7037 + }, + { + "epoch": 0.9, + "grad_norm": 1.466330647468567, + "learning_rate": 2.51170539790363e-07, + "loss": 0.5988, + "step": 7038 + }, + { + "epoch": 0.9, + "grad_norm": 1.3123421669006348, + "learning_rate": 2.50521545322015e-07, + "loss": 0.6526, + "step": 7039 + }, + { + "epoch": 0.9, + "grad_norm": 2.2476837635040283, + "learning_rate": 2.498733688547944e-07, + "loss": 0.6303, + "step": 7040 + }, + { + "epoch": 0.9, + "grad_norm": 2.1309330463409424, + "learning_rate": 2.492260105003358e-07, + "loss": 0.606, + "step": 7041 + }, + { + "epoch": 0.9, + "grad_norm": 1.2924765348434448, + "learning_rate": 2.485794703701333e-07, + "loss": 0.5738, + "step": 7042 + }, + { + "epoch": 0.9, + "grad_norm": 1.3361096382141113, + "learning_rate": 2.479337485755412e-07, + "loss": 0.6349, + "step": 7043 + }, + { + "epoch": 0.9, + "grad_norm": 1.2557562589645386, + "learning_rate": 2.4728884522777106e-07, + "loss": 0.6163, + "step": 7044 + }, + { + "epoch": 0.9, + "grad_norm": 1.16624116897583, + "learning_rate": 2.4664476043789523e-07, + "loss": 0.7221, + "step": 7045 + }, + { + "epoch": 0.9, + "grad_norm": 1.5108433961868286, + "learning_rate": 2.460014943168443e-07, + "loss": 0.6333, + "step": 7046 + }, + { + "epoch": 0.9, + "grad_norm": 1.4388067722320557, + "learning_rate": 2.4535904697540746e-07, + "loss": 0.5811, + "step": 7047 + }, + { + "epoch": 0.9, + "grad_norm": 1.2010633945465088, + "learning_rate": 2.447174185242324e-07, + "loss": 0.5747, + "step": 7048 + }, + { + "epoch": 0.9, + "grad_norm": 2.1095924377441406, + "learning_rate": 2.4407660907382727e-07, + "loss": 0.5752, + "step": 7049 + }, + { + "epoch": 0.9, + "grad_norm": 1.2455929517745972, + "learning_rate": 2.434366187345588e-07, + "loss": 0.5444, + "step": 7050 + }, + { + "epoch": 0.9, + "grad_norm": 1.5187287330627441, + "learning_rate": 2.4279744761665225e-07, + "loss": 0.4958, + "step": 7051 + }, + { + "epoch": 0.9, + "grad_norm": 1.457276463508606, + "learning_rate": 2.4215909583019117e-07, + "loss": 0.581, + "step": 7052 + }, + { + "epoch": 0.9, + "grad_norm": 1.4139649868011475, + "learning_rate": 2.4152156348511923e-07, + "loss": 0.5623, + "step": 7053 + }, + { + "epoch": 0.9, + "grad_norm": 1.4696898460388184, + "learning_rate": 2.408848506912381e-07, + "loss": 0.5736, + "step": 7054 + }, + { + "epoch": 0.9, + "grad_norm": 1.5024607181549072, + "learning_rate": 2.4024895755820956e-07, + "loss": 0.6737, + "step": 7055 + }, + { + "epoch": 0.9, + "grad_norm": 1.4502644538879395, + "learning_rate": 2.3961388419555145e-07, + "loss": 0.5557, + "step": 7056 + }, + { + "epoch": 0.9, + "grad_norm": 1.718114972114563, + "learning_rate": 2.389796307126441e-07, + "loss": 0.5784, + "step": 7057 + }, + { + "epoch": 0.9, + "grad_norm": 1.5058298110961914, + "learning_rate": 2.3834619721872342e-07, + "loss": 0.5671, + "step": 7058 + }, + { + "epoch": 0.9, + "grad_norm": 1.4167766571044922, + "learning_rate": 2.3771358382288666e-07, + "loss": 0.5516, + "step": 7059 + }, + { + "epoch": 0.9, + "grad_norm": 1.5127261877059937, + "learning_rate": 2.370817906340872e-07, + "loss": 0.6122, + "step": 7060 + }, + { + "epoch": 0.9, + "grad_norm": 1.1658446788787842, + "learning_rate": 2.3645081776113965e-07, + "loss": 0.7223, + "step": 7061 + }, + { + "epoch": 0.9, + "grad_norm": 1.3336877822875977, + "learning_rate": 2.3582066531271708e-07, + "loss": 0.5896, + "step": 7062 + }, + { + "epoch": 0.9, + "grad_norm": 1.4371452331542969, + "learning_rate": 2.3519133339734823e-07, + "loss": 0.5797, + "step": 7063 + }, + { + "epoch": 0.91, + "grad_norm": 1.455986738204956, + "learning_rate": 2.3456282212342363e-07, + "loss": 0.6198, + "step": 7064 + }, + { + "epoch": 0.91, + "grad_norm": 1.5969009399414062, + "learning_rate": 2.3393513159919223e-07, + "loss": 0.6366, + "step": 7065 + }, + { + "epoch": 0.91, + "grad_norm": 1.3942619562149048, + "learning_rate": 2.3330826193276145e-07, + "loss": 0.5663, + "step": 7066 + }, + { + "epoch": 0.91, + "grad_norm": 2.943873405456543, + "learning_rate": 2.3268221323209494e-07, + "loss": 0.5417, + "step": 7067 + }, + { + "epoch": 0.91, + "grad_norm": 1.2764878273010254, + "learning_rate": 2.3205698560501755e-07, + "loss": 0.5953, + "step": 7068 + }, + { + "epoch": 0.91, + "grad_norm": 1.2757691144943237, + "learning_rate": 2.314325791592126e-07, + "loss": 0.5584, + "step": 7069 + }, + { + "epoch": 0.91, + "grad_norm": 1.3400564193725586, + "learning_rate": 2.3080899400222178e-07, + "loss": 0.5815, + "step": 7070 + }, + { + "epoch": 0.91, + "grad_norm": 1.1935240030288696, + "learning_rate": 2.301862302414437e-07, + "loss": 0.5787, + "step": 7071 + }, + { + "epoch": 0.91, + "grad_norm": 1.1886448860168457, + "learning_rate": 2.2956428798413755e-07, + "loss": 0.5357, + "step": 7072 + }, + { + "epoch": 0.91, + "grad_norm": 1.3621718883514404, + "learning_rate": 2.2894316733741985e-07, + "loss": 0.5345, + "step": 7073 + }, + { + "epoch": 0.91, + "grad_norm": 1.316648006439209, + "learning_rate": 2.2832286840826667e-07, + "loss": 0.5749, + "step": 7074 + }, + { + "epoch": 0.91, + "grad_norm": 1.4001599550247192, + "learning_rate": 2.277033913035115e-07, + "loss": 0.6113, + "step": 7075 + }, + { + "epoch": 0.91, + "grad_norm": 2.2548327445983887, + "learning_rate": 2.2708473612984617e-07, + "loss": 0.5458, + "step": 7076 + }, + { + "epoch": 0.91, + "grad_norm": 1.441633701324463, + "learning_rate": 2.2646690299382212e-07, + "loss": 0.5897, + "step": 7077 + }, + { + "epoch": 0.91, + "grad_norm": 1.5713400840759277, + "learning_rate": 2.258498920018476e-07, + "loss": 0.5499, + "step": 7078 + }, + { + "epoch": 0.91, + "grad_norm": 1.258521318435669, + "learning_rate": 2.2523370326019145e-07, + "loss": 0.5601, + "step": 7079 + }, + { + "epoch": 0.91, + "grad_norm": 1.3642133474349976, + "learning_rate": 2.246183368749799e-07, + "loss": 0.5553, + "step": 7080 + }, + { + "epoch": 0.91, + "grad_norm": 1.4287675619125366, + "learning_rate": 2.24003792952196e-07, + "loss": 0.6197, + "step": 7081 + }, + { + "epoch": 0.91, + "grad_norm": 1.6514580249786377, + "learning_rate": 2.233900715976828e-07, + "loss": 0.5636, + "step": 7082 + }, + { + "epoch": 0.91, + "grad_norm": 1.6768487691879272, + "learning_rate": 2.2277717291714184e-07, + "loss": 0.5855, + "step": 7083 + }, + { + "epoch": 0.91, + "grad_norm": 1.3704938888549805, + "learning_rate": 2.2216509701613265e-07, + "loss": 0.6517, + "step": 7084 + }, + { + "epoch": 0.91, + "grad_norm": 1.2699650526046753, + "learning_rate": 2.2155384400007196e-07, + "loss": 0.5112, + "step": 7085 + }, + { + "epoch": 0.91, + "grad_norm": 1.198685884475708, + "learning_rate": 2.2094341397423558e-07, + "loss": 0.6796, + "step": 7086 + }, + { + "epoch": 0.91, + "grad_norm": 1.5341168642044067, + "learning_rate": 2.2033380704375829e-07, + "loss": 0.5189, + "step": 7087 + }, + { + "epoch": 0.91, + "grad_norm": 1.5040671825408936, + "learning_rate": 2.1972502331363332e-07, + "loss": 0.6402, + "step": 7088 + }, + { + "epoch": 0.91, + "grad_norm": 1.403660774230957, + "learning_rate": 2.191170628887096e-07, + "loss": 0.5636, + "step": 7089 + }, + { + "epoch": 0.91, + "grad_norm": 1.2295106649398804, + "learning_rate": 2.1850992587369668e-07, + "loss": 0.5676, + "step": 7090 + }, + { + "epoch": 0.91, + "grad_norm": 1.3213706016540527, + "learning_rate": 2.1790361237316204e-07, + "loss": 0.5344, + "step": 7091 + }, + { + "epoch": 0.91, + "grad_norm": 1.5314936637878418, + "learning_rate": 2.1729812249153048e-07, + "loss": 0.6096, + "step": 7092 + }, + { + "epoch": 0.91, + "grad_norm": 1.497307300567627, + "learning_rate": 2.1669345633308526e-07, + "loss": 0.5312, + "step": 7093 + }, + { + "epoch": 0.91, + "grad_norm": 1.2101197242736816, + "learning_rate": 2.1608961400196747e-07, + "loss": 0.6864, + "step": 7094 + }, + { + "epoch": 0.91, + "grad_norm": 1.2792714834213257, + "learning_rate": 2.1548659560217678e-07, + "loss": 0.529, + "step": 7095 + }, + { + "epoch": 0.91, + "grad_norm": 1.6750179529190063, + "learning_rate": 2.148844012375717e-07, + "loss": 0.6093, + "step": 7096 + }, + { + "epoch": 0.91, + "grad_norm": 1.2162268161773682, + "learning_rate": 2.1428303101186708e-07, + "loss": 0.6036, + "step": 7097 + }, + { + "epoch": 0.91, + "grad_norm": 1.2044456005096436, + "learning_rate": 2.1368248502863676e-07, + "loss": 0.5956, + "step": 7098 + }, + { + "epoch": 0.91, + "grad_norm": 1.3525432348251343, + "learning_rate": 2.1308276339131407e-07, + "loss": 0.6101, + "step": 7099 + }, + { + "epoch": 0.91, + "grad_norm": 1.3939049243927002, + "learning_rate": 2.124838662031864e-07, + "loss": 0.5844, + "step": 7100 + }, + { + "epoch": 0.91, + "grad_norm": 2.0812456607818604, + "learning_rate": 2.1188579356740346e-07, + "loss": 0.5712, + "step": 7101 + }, + { + "epoch": 0.91, + "grad_norm": 1.490818738937378, + "learning_rate": 2.1128854558697009e-07, + "loss": 0.597, + "step": 7102 + }, + { + "epoch": 0.91, + "grad_norm": 1.2523460388183594, + "learning_rate": 2.1069212236475177e-07, + "loss": 0.7394, + "step": 7103 + }, + { + "epoch": 0.91, + "grad_norm": 1.3945302963256836, + "learning_rate": 2.1009652400346802e-07, + "loss": 0.57, + "step": 7104 + }, + { + "epoch": 0.91, + "grad_norm": 1.7118395566940308, + "learning_rate": 2.0950175060569956e-07, + "loss": 0.5877, + "step": 7105 + }, + { + "epoch": 0.91, + "grad_norm": 1.2491495609283447, + "learning_rate": 2.0890780227388385e-07, + "loss": 0.5995, + "step": 7106 + }, + { + "epoch": 0.91, + "grad_norm": 1.2308311462402344, + "learning_rate": 2.08314679110318e-07, + "loss": 0.6299, + "step": 7107 + }, + { + "epoch": 0.91, + "grad_norm": 1.270413875579834, + "learning_rate": 2.0772238121715248e-07, + "loss": 0.5532, + "step": 7108 + }, + { + "epoch": 0.91, + "grad_norm": 1.3391141891479492, + "learning_rate": 2.0713090869640072e-07, + "loss": 0.6113, + "step": 7109 + }, + { + "epoch": 0.91, + "grad_norm": 1.320900559425354, + "learning_rate": 2.0654026164993123e-07, + "loss": 0.565, + "step": 7110 + }, + { + "epoch": 0.91, + "grad_norm": 1.3125171661376953, + "learning_rate": 2.0595044017947153e-07, + "loss": 0.621, + "step": 7111 + }, + { + "epoch": 0.91, + "grad_norm": 1.7887307405471802, + "learning_rate": 2.053614443866042e-07, + "loss": 0.6705, + "step": 7112 + }, + { + "epoch": 0.91, + "grad_norm": 1.334878921508789, + "learning_rate": 2.0477327437277427e-07, + "loss": 0.6416, + "step": 7113 + }, + { + "epoch": 0.91, + "grad_norm": 1.0813583135604858, + "learning_rate": 2.041859302392818e-07, + "loss": 0.5807, + "step": 7114 + }, + { + "epoch": 0.91, + "grad_norm": 1.5186846256256104, + "learning_rate": 2.0359941208728363e-07, + "loss": 0.6429, + "step": 7115 + }, + { + "epoch": 0.91, + "grad_norm": 1.855786919593811, + "learning_rate": 2.0301372001779673e-07, + "loss": 0.664, + "step": 7116 + }, + { + "epoch": 0.91, + "grad_norm": 1.525266408920288, + "learning_rate": 2.0242885413169376e-07, + "loss": 0.618, + "step": 7117 + }, + { + "epoch": 0.91, + "grad_norm": 1.3486268520355225, + "learning_rate": 2.0184481452970694e-07, + "loss": 0.5815, + "step": 7118 + }, + { + "epoch": 0.91, + "grad_norm": 1.5850796699523926, + "learning_rate": 2.012616013124241e-07, + "loss": 0.5655, + "step": 7119 + }, + { + "epoch": 0.91, + "grad_norm": 1.358275055885315, + "learning_rate": 2.0067921458029272e-07, + "loss": 0.5076, + "step": 7120 + }, + { + "epoch": 0.91, + "grad_norm": 1.3294881582260132, + "learning_rate": 2.000976544336164e-07, + "loss": 0.6009, + "step": 7121 + }, + { + "epoch": 0.91, + "grad_norm": 1.6353390216827393, + "learning_rate": 1.9951692097255836e-07, + "loss": 0.5333, + "step": 7122 + }, + { + "epoch": 0.91, + "grad_norm": 1.490020990371704, + "learning_rate": 1.989370142971364e-07, + "loss": 0.6594, + "step": 7123 + }, + { + "epoch": 0.91, + "grad_norm": 1.7284151315689087, + "learning_rate": 1.983579345072284e-07, + "loss": 0.5152, + "step": 7124 + }, + { + "epoch": 0.91, + "grad_norm": 1.3690820932388306, + "learning_rate": 1.9777968170257012e-07, + "loss": 0.5566, + "step": 7125 + }, + { + "epoch": 0.91, + "grad_norm": 1.5744019746780396, + "learning_rate": 1.972022559827519e-07, + "loss": 0.6513, + "step": 7126 + }, + { + "epoch": 0.91, + "grad_norm": 2.656482696533203, + "learning_rate": 1.9662565744722472e-07, + "loss": 0.523, + "step": 7127 + }, + { + "epoch": 0.91, + "grad_norm": 2.3750336170196533, + "learning_rate": 1.9604988619529586e-07, + "loss": 0.5436, + "step": 7128 + }, + { + "epoch": 0.91, + "grad_norm": 1.3693758249282837, + "learning_rate": 1.954749423261304e-07, + "loss": 0.5961, + "step": 7129 + }, + { + "epoch": 0.91, + "grad_norm": 1.8211826086044312, + "learning_rate": 1.9490082593875026e-07, + "loss": 0.6507, + "step": 7130 + }, + { + "epoch": 0.91, + "grad_norm": 2.833078384399414, + "learning_rate": 1.9432753713203524e-07, + "loss": 0.5015, + "step": 7131 + }, + { + "epoch": 0.91, + "grad_norm": 1.487642526626587, + "learning_rate": 1.937550760047241e-07, + "loss": 0.5934, + "step": 7132 + }, + { + "epoch": 0.91, + "grad_norm": 1.3412609100341797, + "learning_rate": 1.9318344265540967e-07, + "loss": 0.564, + "step": 7133 + }, + { + "epoch": 0.91, + "grad_norm": 1.496861219406128, + "learning_rate": 1.926126371825454e-07, + "loss": 0.5433, + "step": 7134 + }, + { + "epoch": 0.91, + "grad_norm": 1.3020457029342651, + "learning_rate": 1.9204265968444047e-07, + "loss": 0.4675, + "step": 7135 + }, + { + "epoch": 0.91, + "grad_norm": 1.3243365287780762, + "learning_rate": 1.9147351025926242e-07, + "loss": 0.583, + "step": 7136 + }, + { + "epoch": 0.91, + "grad_norm": 1.173129677772522, + "learning_rate": 1.9090518900503508e-07, + "loss": 0.6188, + "step": 7137 + }, + { + "epoch": 0.91, + "grad_norm": 1.4048100709915161, + "learning_rate": 1.9033769601964013e-07, + "loss": 0.5896, + "step": 7138 + }, + { + "epoch": 0.91, + "grad_norm": 1.206529140472412, + "learning_rate": 1.8977103140081664e-07, + "loss": 0.5598, + "step": 7139 + }, + { + "epoch": 0.91, + "grad_norm": 1.4039027690887451, + "learning_rate": 1.8920519524616265e-07, + "loss": 0.5779, + "step": 7140 + }, + { + "epoch": 0.91, + "grad_norm": 1.485414743423462, + "learning_rate": 1.8864018765312963e-07, + "loss": 0.5744, + "step": 7141 + }, + { + "epoch": 0.91, + "grad_norm": 1.6506009101867676, + "learning_rate": 1.880760087190303e-07, + "loss": 0.5637, + "step": 7142 + }, + { + "epoch": 0.92, + "grad_norm": 1.588201642036438, + "learning_rate": 1.8751265854103197e-07, + "loss": 0.4825, + "step": 7143 + }, + { + "epoch": 0.92, + "grad_norm": 1.8149200677871704, + "learning_rate": 1.869501372161614e-07, + "loss": 0.5513, + "step": 7144 + }, + { + "epoch": 0.92, + "grad_norm": 2.2272982597351074, + "learning_rate": 1.8638844484130058e-07, + "loss": 0.5979, + "step": 7145 + }, + { + "epoch": 0.92, + "grad_norm": 1.5402497053146362, + "learning_rate": 1.858275815131888e-07, + "loss": 0.5783, + "step": 7146 + }, + { + "epoch": 0.92, + "grad_norm": 1.4437787532806396, + "learning_rate": 1.85267547328426e-07, + "loss": 0.5892, + "step": 7147 + }, + { + "epoch": 0.92, + "grad_norm": 1.1562960147857666, + "learning_rate": 1.8470834238346448e-07, + "loss": 0.5223, + "step": 7148 + }, + { + "epoch": 0.92, + "grad_norm": 1.4266095161437988, + "learning_rate": 1.8414996677461605e-07, + "loss": 0.6465, + "step": 7149 + }, + { + "epoch": 0.92, + "grad_norm": 1.4627182483673096, + "learning_rate": 1.8359242059805048e-07, + "loss": 0.6431, + "step": 7150 + }, + { + "epoch": 0.92, + "grad_norm": 1.4455598592758179, + "learning_rate": 1.8303570394979375e-07, + "loss": 0.6584, + "step": 7151 + }, + { + "epoch": 0.92, + "grad_norm": 1.3495609760284424, + "learning_rate": 1.8247981692572802e-07, + "loss": 0.6074, + "step": 7152 + }, + { + "epoch": 0.92, + "grad_norm": 1.1152602434158325, + "learning_rate": 1.8192475962159395e-07, + "loss": 0.4656, + "step": 7153 + }, + { + "epoch": 0.92, + "grad_norm": 1.3456722497940063, + "learning_rate": 1.8137053213298895e-07, + "loss": 0.5437, + "step": 7154 + }, + { + "epoch": 0.92, + "grad_norm": 1.8487489223480225, + "learning_rate": 1.808171345553683e-07, + "loss": 0.5297, + "step": 7155 + }, + { + "epoch": 0.92, + "grad_norm": 1.3239003419876099, + "learning_rate": 1.8026456698404192e-07, + "loss": 0.5854, + "step": 7156 + }, + { + "epoch": 0.92, + "grad_norm": 1.531930923461914, + "learning_rate": 1.7971282951417923e-07, + "loss": 0.5994, + "step": 7157 + }, + { + "epoch": 0.92, + "grad_norm": 1.3221821784973145, + "learning_rate": 1.7916192224080586e-07, + "loss": 0.5434, + "step": 7158 + }, + { + "epoch": 0.92, + "grad_norm": 1.7776752710342407, + "learning_rate": 1.7861184525880425e-07, + "loss": 0.6325, + "step": 7159 + }, + { + "epoch": 0.92, + "grad_norm": 1.3057001829147339, + "learning_rate": 1.7806259866291366e-07, + "loss": 0.548, + "step": 7160 + }, + { + "epoch": 0.92, + "grad_norm": 1.2270419597625732, + "learning_rate": 1.775141825477311e-07, + "loss": 0.5853, + "step": 7161 + }, + { + "epoch": 0.92, + "grad_norm": 1.3529689311981201, + "learning_rate": 1.7696659700770948e-07, + "loss": 0.5753, + "step": 7162 + }, + { + "epoch": 0.92, + "grad_norm": 1.1730376482009888, + "learning_rate": 1.764198421371599e-07, + "loss": 0.5227, + "step": 7163 + }, + { + "epoch": 0.92, + "grad_norm": 1.4664965867996216, + "learning_rate": 1.758739180302499e-07, + "loss": 0.5583, + "step": 7164 + }, + { + "epoch": 0.92, + "grad_norm": 1.7074847221374512, + "learning_rate": 1.753288247810031e-07, + "loss": 0.6546, + "step": 7165 + }, + { + "epoch": 0.92, + "grad_norm": 1.511422872543335, + "learning_rate": 1.7478456248330166e-07, + "loss": 0.6486, + "step": 7166 + }, + { + "epoch": 0.92, + "grad_norm": 1.5393157005310059, + "learning_rate": 1.7424113123088282e-07, + "loss": 0.6168, + "step": 7167 + }, + { + "epoch": 0.92, + "grad_norm": 2.8663549423217773, + "learning_rate": 1.736985311173417e-07, + "loss": 0.6082, + "step": 7168 + }, + { + "epoch": 0.92, + "grad_norm": 1.5243266820907593, + "learning_rate": 1.7315676223613077e-07, + "loss": 0.5553, + "step": 7169 + }, + { + "epoch": 0.92, + "grad_norm": 1.2957146167755127, + "learning_rate": 1.7261582468055872e-07, + "loss": 0.5589, + "step": 7170 + }, + { + "epoch": 0.92, + "grad_norm": 1.3966628313064575, + "learning_rate": 1.720757185437899e-07, + "loss": 0.5563, + "step": 7171 + }, + { + "epoch": 0.92, + "grad_norm": 1.2726695537567139, + "learning_rate": 1.7153644391884715e-07, + "loss": 0.6512, + "step": 7172 + }, + { + "epoch": 0.92, + "grad_norm": 1.4725221395492554, + "learning_rate": 1.7099800089861006e-07, + "loss": 0.5772, + "step": 7173 + }, + { + "epoch": 0.92, + "grad_norm": 1.5598526000976562, + "learning_rate": 1.7046038957581447e-07, + "loss": 0.6274, + "step": 7174 + }, + { + "epoch": 0.92, + "grad_norm": 2.330385684967041, + "learning_rate": 1.6992361004305235e-07, + "loss": 0.5829, + "step": 7175 + }, + { + "epoch": 0.92, + "grad_norm": 1.331067681312561, + "learning_rate": 1.693876623927737e-07, + "loss": 0.5583, + "step": 7176 + }, + { + "epoch": 0.92, + "grad_norm": 1.3554342985153198, + "learning_rate": 1.6885254671728468e-07, + "loss": 0.5674, + "step": 7177 + }, + { + "epoch": 0.92, + "grad_norm": 1.254412055015564, + "learning_rate": 1.683182631087471e-07, + "loss": 0.6022, + "step": 7178 + }, + { + "epoch": 0.92, + "grad_norm": 1.508400321006775, + "learning_rate": 1.677848116591807e-07, + "loss": 0.5477, + "step": 7179 + }, + { + "epoch": 0.92, + "grad_norm": 1.2750962972640991, + "learning_rate": 1.6725219246046254e-07, + "loss": 0.5805, + "step": 7180 + }, + { + "epoch": 0.92, + "grad_norm": 1.5220352411270142, + "learning_rate": 1.6672040560432533e-07, + "loss": 0.6261, + "step": 7181 + }, + { + "epoch": 0.92, + "grad_norm": 1.256548523902893, + "learning_rate": 1.661894511823575e-07, + "loss": 0.5502, + "step": 7182 + }, + { + "epoch": 0.92, + "grad_norm": 1.461651086807251, + "learning_rate": 1.6565932928600593e-07, + "loss": 0.4907, + "step": 7183 + }, + { + "epoch": 0.92, + "grad_norm": 1.1890257596969604, + "learning_rate": 1.6513004000657418e-07, + "loss": 0.5885, + "step": 7184 + }, + { + "epoch": 0.92, + "grad_norm": 1.4478956460952759, + "learning_rate": 1.6460158343521993e-07, + "loss": 0.5186, + "step": 7185 + }, + { + "epoch": 0.92, + "grad_norm": 1.0976368188858032, + "learning_rate": 1.640739596629598e-07, + "loss": 0.7179, + "step": 7186 + }, + { + "epoch": 0.92, + "grad_norm": 1.2835136651992798, + "learning_rate": 1.6354716878066612e-07, + "loss": 0.5308, + "step": 7187 + }, + { + "epoch": 0.92, + "grad_norm": 1.4048012495040894, + "learning_rate": 1.6302121087906854e-07, + "loss": 0.6106, + "step": 7188 + }, + { + "epoch": 0.92, + "grad_norm": 1.2641879320144653, + "learning_rate": 1.6249608604875177e-07, + "loss": 0.6223, + "step": 7189 + }, + { + "epoch": 0.92, + "grad_norm": 1.7437182664871216, + "learning_rate": 1.6197179438015798e-07, + "loss": 0.5893, + "step": 7190 + }, + { + "epoch": 0.92, + "grad_norm": 1.3747304677963257, + "learning_rate": 1.6144833596358656e-07, + "loss": 0.5148, + "step": 7191 + }, + { + "epoch": 0.92, + "grad_norm": 1.4542993307113647, + "learning_rate": 1.6092571088919205e-07, + "loss": 0.5563, + "step": 7192 + }, + { + "epoch": 0.92, + "grad_norm": 1.3954862356185913, + "learning_rate": 1.6040391924698584e-07, + "loss": 0.6165, + "step": 7193 + }, + { + "epoch": 0.92, + "grad_norm": 1.301512598991394, + "learning_rate": 1.5988296112683598e-07, + "loss": 0.5522, + "step": 7194 + }, + { + "epoch": 0.92, + "grad_norm": 1.3332188129425049, + "learning_rate": 1.5936283661846686e-07, + "loss": 0.5854, + "step": 7195 + }, + { + "epoch": 0.92, + "grad_norm": 1.3937634229660034, + "learning_rate": 1.588435458114601e-07, + "loss": 0.6015, + "step": 7196 + }, + { + "epoch": 0.92, + "grad_norm": 1.4809768199920654, + "learning_rate": 1.5832508879525143e-07, + "loss": 0.4986, + "step": 7197 + }, + { + "epoch": 0.92, + "grad_norm": 1.265257477760315, + "learning_rate": 1.5780746565913552e-07, + "loss": 0.6384, + "step": 7198 + }, + { + "epoch": 0.92, + "grad_norm": 1.1740734577178955, + "learning_rate": 1.5729067649226327e-07, + "loss": 0.5811, + "step": 7199 + }, + { + "epoch": 0.92, + "grad_norm": 1.4921587705612183, + "learning_rate": 1.5677472138364014e-07, + "loss": 0.5796, + "step": 7200 + }, + { + "epoch": 0.92, + "grad_norm": 1.2573281526565552, + "learning_rate": 1.562596004221284e-07, + "loss": 0.5726, + "step": 7201 + }, + { + "epoch": 0.92, + "grad_norm": 1.4946744441986084, + "learning_rate": 1.557453136964482e-07, + "loss": 0.568, + "step": 7202 + }, + { + "epoch": 0.92, + "grad_norm": 1.1617540121078491, + "learning_rate": 1.5523186129517475e-07, + "loss": 0.5592, + "step": 7203 + }, + { + "epoch": 0.92, + "grad_norm": 1.2968569993972778, + "learning_rate": 1.547192433067396e-07, + "loss": 0.5903, + "step": 7204 + }, + { + "epoch": 0.92, + "grad_norm": 1.3031530380249023, + "learning_rate": 1.5420745981943042e-07, + "loss": 0.5648, + "step": 7205 + }, + { + "epoch": 0.92, + "grad_norm": 1.2497785091400146, + "learning_rate": 1.5369651092139172e-07, + "loss": 0.5518, + "step": 7206 + }, + { + "epoch": 0.92, + "grad_norm": 1.333309531211853, + "learning_rate": 1.531863967006253e-07, + "loss": 0.5962, + "step": 7207 + }, + { + "epoch": 0.92, + "grad_norm": 1.2219014167785645, + "learning_rate": 1.526771172449859e-07, + "loss": 0.6769, + "step": 7208 + }, + { + "epoch": 0.92, + "grad_norm": 1.6595335006713867, + "learning_rate": 1.5216867264218726e-07, + "loss": 0.5918, + "step": 7209 + }, + { + "epoch": 0.92, + "grad_norm": 1.4741884469985962, + "learning_rate": 1.516610629797993e-07, + "loss": 0.6679, + "step": 7210 + }, + { + "epoch": 0.92, + "grad_norm": 1.4178804159164429, + "learning_rate": 1.511542883452477e-07, + "loss": 0.592, + "step": 7211 + }, + { + "epoch": 0.92, + "grad_norm": 1.6573342084884644, + "learning_rate": 1.5064834882581314e-07, + "loss": 0.6468, + "step": 7212 + }, + { + "epoch": 0.92, + "grad_norm": 1.2453925609588623, + "learning_rate": 1.5014324450863316e-07, + "loss": 0.5826, + "step": 7213 + }, + { + "epoch": 0.92, + "grad_norm": 2.3448381423950195, + "learning_rate": 1.496389754807026e-07, + "loss": 0.6063, + "step": 7214 + }, + { + "epoch": 0.92, + "grad_norm": 1.3461638689041138, + "learning_rate": 1.4913554182887147e-07, + "loss": 0.6598, + "step": 7215 + }, + { + "epoch": 0.92, + "grad_norm": 1.6426374912261963, + "learning_rate": 1.4863294363984536e-07, + "loss": 0.6191, + "step": 7216 + }, + { + "epoch": 0.92, + "grad_norm": 1.748055100440979, + "learning_rate": 1.481311810001873e-07, + "loss": 0.6522, + "step": 7217 + }, + { + "epoch": 0.92, + "grad_norm": 1.689281702041626, + "learning_rate": 1.4763025399631535e-07, + "loss": 0.6215, + "step": 7218 + }, + { + "epoch": 0.92, + "grad_norm": 1.246228575706482, + "learning_rate": 1.4713016271450443e-07, + "loss": 0.5989, + "step": 7219 + }, + { + "epoch": 0.92, + "grad_norm": 2.082564353942871, + "learning_rate": 1.466309072408839e-07, + "loss": 0.6292, + "step": 7220 + }, + { + "epoch": 0.93, + "grad_norm": 1.7110521793365479, + "learning_rate": 1.4613248766144172e-07, + "loss": 0.5804, + "step": 7221 + }, + { + "epoch": 0.93, + "grad_norm": 1.2282060384750366, + "learning_rate": 1.456349040620203e-07, + "loss": 0.6041, + "step": 7222 + }, + { + "epoch": 0.93, + "grad_norm": 1.646616816520691, + "learning_rate": 1.4513815652831776e-07, + "loss": 0.604, + "step": 7223 + }, + { + "epoch": 0.93, + "grad_norm": 1.2719740867614746, + "learning_rate": 1.4464224514588842e-07, + "loss": 0.6009, + "step": 7224 + }, + { + "epoch": 0.93, + "grad_norm": 1.63080632686615, + "learning_rate": 1.4414717000014456e-07, + "loss": 0.5949, + "step": 7225 + }, + { + "epoch": 0.93, + "grad_norm": 1.7271970510482788, + "learning_rate": 1.436529311763507e-07, + "loss": 0.5815, + "step": 7226 + }, + { + "epoch": 0.93, + "grad_norm": 1.3970266580581665, + "learning_rate": 1.43159528759631e-07, + "loss": 0.5571, + "step": 7227 + }, + { + "epoch": 0.93, + "grad_norm": 1.2534281015396118, + "learning_rate": 1.426669628349636e-07, + "loss": 0.6054, + "step": 7228 + }, + { + "epoch": 0.93, + "grad_norm": 1.373757004737854, + "learning_rate": 1.4217523348718287e-07, + "loss": 0.6324, + "step": 7229 + }, + { + "epoch": 0.93, + "grad_norm": 1.6304547786712646, + "learning_rate": 1.4168434080097937e-07, + "loss": 0.6204, + "step": 7230 + }, + { + "epoch": 0.93, + "grad_norm": 1.140513300895691, + "learning_rate": 1.411942848608988e-07, + "loss": 0.6375, + "step": 7231 + }, + { + "epoch": 0.93, + "grad_norm": 1.1663187742233276, + "learning_rate": 1.4070506575134367e-07, + "loss": 0.5516, + "step": 7232 + }, + { + "epoch": 0.93, + "grad_norm": 1.328594446182251, + "learning_rate": 1.4021668355657215e-07, + "loss": 0.5821, + "step": 7233 + }, + { + "epoch": 0.93, + "grad_norm": 1.4575828313827515, + "learning_rate": 1.3972913836069857e-07, + "loss": 0.6476, + "step": 7234 + }, + { + "epoch": 0.93, + "grad_norm": 1.5077722072601318, + "learning_rate": 1.3924243024769135e-07, + "loss": 0.5528, + "step": 7235 + }, + { + "epoch": 0.93, + "grad_norm": 1.1910840272903442, + "learning_rate": 1.387565593013779e-07, + "loss": 0.5179, + "step": 7236 + }, + { + "epoch": 0.93, + "grad_norm": 1.430280089378357, + "learning_rate": 1.3827152560543843e-07, + "loss": 0.6201, + "step": 7237 + }, + { + "epoch": 0.93, + "grad_norm": 1.1076302528381348, + "learning_rate": 1.377873292434101e-07, + "loss": 0.6917, + "step": 7238 + }, + { + "epoch": 0.93, + "grad_norm": 1.003353476524353, + "learning_rate": 1.3730397029868615e-07, + "loss": 0.559, + "step": 7239 + }, + { + "epoch": 0.93, + "grad_norm": 1.5710481405258179, + "learning_rate": 1.3682144885451555e-07, + "loss": 0.5213, + "step": 7240 + }, + { + "epoch": 0.93, + "grad_norm": 1.5206317901611328, + "learning_rate": 1.3633976499400235e-07, + "loss": 0.59, + "step": 7241 + }, + { + "epoch": 0.93, + "grad_norm": 1.4319339990615845, + "learning_rate": 1.3585891880010747e-07, + "loss": 0.6111, + "step": 7242 + }, + { + "epoch": 0.93, + "grad_norm": 1.376744031906128, + "learning_rate": 1.3537891035564576e-07, + "loss": 0.6414, + "step": 7243 + }, + { + "epoch": 0.93, + "grad_norm": 1.5035916566848755, + "learning_rate": 1.3489973974329053e-07, + "loss": 0.732, + "step": 7244 + }, + { + "epoch": 0.93, + "grad_norm": 1.226749062538147, + "learning_rate": 1.3442140704556794e-07, + "loss": 0.5514, + "step": 7245 + }, + { + "epoch": 0.93, + "grad_norm": 1.0988937616348267, + "learning_rate": 1.3394391234486104e-07, + "loss": 0.5946, + "step": 7246 + }, + { + "epoch": 0.93, + "grad_norm": 1.2332367897033691, + "learning_rate": 1.3346725572340903e-07, + "loss": 0.5686, + "step": 7247 + }, + { + "epoch": 0.93, + "grad_norm": 1.4782696962356567, + "learning_rate": 1.3299143726330676e-07, + "loss": 0.6448, + "step": 7248 + }, + { + "epoch": 0.93, + "grad_norm": 1.3674440383911133, + "learning_rate": 1.3251645704650262e-07, + "loss": 0.6015, + "step": 7249 + }, + { + "epoch": 0.93, + "grad_norm": 1.3126945495605469, + "learning_rate": 1.3204231515480447e-07, + "loss": 0.6499, + "step": 7250 + }, + { + "epoch": 0.93, + "grad_norm": 1.5984584093093872, + "learning_rate": 1.3156901166987258e-07, + "loss": 0.6032, + "step": 7251 + }, + { + "epoch": 0.93, + "grad_norm": 1.3169101476669312, + "learning_rate": 1.3109654667322337e-07, + "loss": 0.5563, + "step": 7252 + }, + { + "epoch": 0.93, + "grad_norm": 1.4062318801879883, + "learning_rate": 1.306249202462301e-07, + "loss": 0.6181, + "step": 7253 + }, + { + "epoch": 0.93, + "grad_norm": 1.340091347694397, + "learning_rate": 1.3015413247012054e-07, + "loss": 0.592, + "step": 7254 + }, + { + "epoch": 0.93, + "grad_norm": 1.2546271085739136, + "learning_rate": 1.2968418342597876e-07, + "loss": 0.6095, + "step": 7255 + }, + { + "epoch": 0.93, + "grad_norm": 1.1093549728393555, + "learning_rate": 1.2921507319474337e-07, + "loss": 0.7472, + "step": 7256 + }, + { + "epoch": 0.93, + "grad_norm": 1.3403831720352173, + "learning_rate": 1.2874680185720857e-07, + "loss": 0.484, + "step": 7257 + }, + { + "epoch": 0.93, + "grad_norm": 1.4152884483337402, + "learning_rate": 1.2827936949402598e-07, + "loss": 0.6114, + "step": 7258 + }, + { + "epoch": 0.93, + "grad_norm": 1.3531930446624756, + "learning_rate": 1.2781277618570066e-07, + "loss": 0.5642, + "step": 7259 + }, + { + "epoch": 0.93, + "grad_norm": 1.5098471641540527, + "learning_rate": 1.273470220125933e-07, + "loss": 0.5642, + "step": 7260 + }, + { + "epoch": 0.93, + "grad_norm": 1.4057996273040771, + "learning_rate": 1.2688210705492088e-07, + "loss": 0.6116, + "step": 7261 + }, + { + "epoch": 0.93, + "grad_norm": 1.2736021280288696, + "learning_rate": 1.26418031392756e-07, + "loss": 0.6281, + "step": 7262 + }, + { + "epoch": 0.93, + "grad_norm": 1.0702874660491943, + "learning_rate": 1.2595479510602583e-07, + "loss": 0.6903, + "step": 7263 + }, + { + "epoch": 0.93, + "grad_norm": 1.287724494934082, + "learning_rate": 1.2549239827451375e-07, + "loss": 0.5868, + "step": 7264 + }, + { + "epoch": 0.93, + "grad_norm": 1.1175434589385986, + "learning_rate": 1.2503084097785777e-07, + "loss": 0.6007, + "step": 7265 + }, + { + "epoch": 0.93, + "grad_norm": 1.1822404861450195, + "learning_rate": 1.2457012329555206e-07, + "loss": 0.5906, + "step": 7266 + }, + { + "epoch": 0.93, + "grad_norm": 1.4445996284484863, + "learning_rate": 1.2411024530694537e-07, + "loss": 0.6044, + "step": 7267 + }, + { + "epoch": 0.93, + "grad_norm": 1.1158324480056763, + "learning_rate": 1.2365120709124322e-07, + "loss": 0.6941, + "step": 7268 + }, + { + "epoch": 0.93, + "grad_norm": 1.6972533464431763, + "learning_rate": 1.2319300872750518e-07, + "loss": 0.6134, + "step": 7269 + }, + { + "epoch": 0.93, + "grad_norm": 1.1953438520431519, + "learning_rate": 1.2273565029464694e-07, + "loss": 0.7026, + "step": 7270 + }, + { + "epoch": 0.93, + "grad_norm": 1.5305699110031128, + "learning_rate": 1.2227913187143835e-07, + "loss": 0.547, + "step": 7271 + }, + { + "epoch": 0.93, + "grad_norm": 1.451676607131958, + "learning_rate": 1.2182345353650592e-07, + "loss": 0.6202, + "step": 7272 + }, + { + "epoch": 0.93, + "grad_norm": 1.5434238910675049, + "learning_rate": 1.2136861536833134e-07, + "loss": 0.6061, + "step": 7273 + }, + { + "epoch": 0.93, + "grad_norm": 1.2015151977539062, + "learning_rate": 1.209146174452508e-07, + "loss": 0.4739, + "step": 7274 + }, + { + "epoch": 0.93, + "grad_norm": 1.211741328239441, + "learning_rate": 1.2046145984545565e-07, + "loss": 0.6157, + "step": 7275 + }, + { + "epoch": 0.93, + "grad_norm": 1.2866265773773193, + "learning_rate": 1.2000914264699403e-07, + "loss": 0.5505, + "step": 7276 + }, + { + "epoch": 0.93, + "grad_norm": 1.437593698501587, + "learning_rate": 1.1955766592776863e-07, + "loss": 0.5961, + "step": 7277 + }, + { + "epoch": 0.93, + "grad_norm": 1.3555943965911865, + "learning_rate": 1.1910702976553612e-07, + "loss": 0.642, + "step": 7278 + }, + { + "epoch": 0.93, + "grad_norm": 1.1852178573608398, + "learning_rate": 1.1865723423790999e-07, + "loss": 0.5775, + "step": 7279 + }, + { + "epoch": 0.93, + "grad_norm": 1.3030401468276978, + "learning_rate": 1.1820827942235824e-07, + "loss": 0.5631, + "step": 7280 + }, + { + "epoch": 0.93, + "grad_norm": 1.204147219657898, + "learning_rate": 1.177601653962046e-07, + "loss": 0.733, + "step": 7281 + }, + { + "epoch": 0.93, + "grad_norm": 1.1808563470840454, + "learning_rate": 1.1731289223662679e-07, + "loss": 0.5254, + "step": 7282 + }, + { + "epoch": 0.93, + "grad_norm": 1.5205473899841309, + "learning_rate": 1.1686646002065815e-07, + "loss": 0.586, + "step": 7283 + }, + { + "epoch": 0.93, + "grad_norm": 1.6285957098007202, + "learning_rate": 1.1642086882519e-07, + "loss": 0.6336, + "step": 7284 + }, + { + "epoch": 0.93, + "grad_norm": 1.5329946279525757, + "learning_rate": 1.1597611872696368e-07, + "loss": 0.6124, + "step": 7285 + }, + { + "epoch": 0.93, + "grad_norm": 1.4156322479248047, + "learning_rate": 1.1553220980257962e-07, + "loss": 0.597, + "step": 7286 + }, + { + "epoch": 0.93, + "grad_norm": 1.3494125604629517, + "learning_rate": 1.150891421284922e-07, + "loss": 0.5883, + "step": 7287 + }, + { + "epoch": 0.93, + "grad_norm": 1.2939672470092773, + "learning_rate": 1.146469157810104e-07, + "loss": 0.6399, + "step": 7288 + }, + { + "epoch": 0.93, + "grad_norm": 3.1361777782440186, + "learning_rate": 1.1420553083629882e-07, + "loss": 0.6024, + "step": 7289 + }, + { + "epoch": 0.93, + "grad_norm": 1.2731001377105713, + "learning_rate": 1.1376498737037722e-07, + "loss": 0.5837, + "step": 7290 + }, + { + "epoch": 0.93, + "grad_norm": 1.1375939846038818, + "learning_rate": 1.1332528545911992e-07, + "loss": 0.6947, + "step": 7291 + }, + { + "epoch": 0.93, + "grad_norm": 1.776005744934082, + "learning_rate": 1.1288642517825688e-07, + "loss": 0.5576, + "step": 7292 + }, + { + "epoch": 0.93, + "grad_norm": 1.6414631605148315, + "learning_rate": 1.1244840660337264e-07, + "loss": 0.5971, + "step": 7293 + }, + { + "epoch": 0.93, + "grad_norm": 1.1578058004379272, + "learning_rate": 1.1201122980990742e-07, + "loss": 0.7463, + "step": 7294 + }, + { + "epoch": 0.93, + "grad_norm": 1.6849491596221924, + "learning_rate": 1.1157489487315542e-07, + "loss": 0.581, + "step": 7295 + }, + { + "epoch": 0.93, + "grad_norm": 1.2874782085418701, + "learning_rate": 1.1113940186826767e-07, + "loss": 0.5576, + "step": 7296 + }, + { + "epoch": 0.93, + "grad_norm": 1.0534873008728027, + "learning_rate": 1.1070475087024745e-07, + "loss": 0.6968, + "step": 7297 + }, + { + "epoch": 0.93, + "grad_norm": 1.308283805847168, + "learning_rate": 1.1027094195395548e-07, + "loss": 0.5559, + "step": 7298 + }, + { + "epoch": 0.94, + "grad_norm": 1.334275245666504, + "learning_rate": 1.0983797519410588e-07, + "loss": 0.6184, + "step": 7299 + }, + { + "epoch": 0.94, + "grad_norm": 1.2813701629638672, + "learning_rate": 1.0940585066526899e-07, + "loss": 0.587, + "step": 7300 + }, + { + "epoch": 0.94, + "grad_norm": 1.355509638786316, + "learning_rate": 1.0897456844186971e-07, + "loss": 0.6175, + "step": 7301 + }, + { + "epoch": 0.94, + "grad_norm": 1.4116696119308472, + "learning_rate": 1.0854412859818642e-07, + "loss": 0.5073, + "step": 7302 + }, + { + "epoch": 0.94, + "grad_norm": 1.340970516204834, + "learning_rate": 1.0811453120835535e-07, + "loss": 0.5772, + "step": 7303 + }, + { + "epoch": 0.94, + "grad_norm": 1.5687137842178345, + "learning_rate": 1.0768577634636457e-07, + "loss": 0.5782, + "step": 7304 + }, + { + "epoch": 0.94, + "grad_norm": 1.321331262588501, + "learning_rate": 1.0725786408605887e-07, + "loss": 0.5902, + "step": 7305 + }, + { + "epoch": 0.94, + "grad_norm": 1.3391438722610474, + "learning_rate": 1.068307945011371e-07, + "loss": 0.6029, + "step": 7306 + }, + { + "epoch": 0.94, + "grad_norm": 1.2736539840698242, + "learning_rate": 1.0640456766515428e-07, + "loss": 0.6101, + "step": 7307 + }, + { + "epoch": 0.94, + "grad_norm": 1.5165783166885376, + "learning_rate": 1.0597918365151838e-07, + "loss": 0.5793, + "step": 7308 + }, + { + "epoch": 0.94, + "grad_norm": 1.2305033206939697, + "learning_rate": 1.0555464253349301e-07, + "loss": 0.5675, + "step": 7309 + }, + { + "epoch": 0.94, + "grad_norm": 1.2480313777923584, + "learning_rate": 1.0513094438419747e-07, + "loss": 0.518, + "step": 7310 + }, + { + "epoch": 0.94, + "grad_norm": 1.5531625747680664, + "learning_rate": 1.0470808927660503e-07, + "loss": 0.6093, + "step": 7311 + }, + { + "epoch": 0.94, + "grad_norm": 1.1417747735977173, + "learning_rate": 1.0428607728354301e-07, + "loss": 0.7112, + "step": 7312 + }, + { + "epoch": 0.94, + "grad_norm": 1.267432689666748, + "learning_rate": 1.0386490847769547e-07, + "loss": 0.6231, + "step": 7313 + }, + { + "epoch": 0.94, + "grad_norm": 1.4412930011749268, + "learning_rate": 1.0344458293159998e-07, + "loss": 0.6256, + "step": 7314 + }, + { + "epoch": 0.94, + "grad_norm": 1.665556788444519, + "learning_rate": 1.030251007176486e-07, + "loss": 0.5757, + "step": 7315 + }, + { + "epoch": 0.94, + "grad_norm": 1.5385949611663818, + "learning_rate": 1.026064619080891e-07, + "loss": 0.657, + "step": 7316 + }, + { + "epoch": 0.94, + "grad_norm": 1.289075255393982, + "learning_rate": 1.0218866657502268e-07, + "loss": 0.6306, + "step": 7317 + }, + { + "epoch": 0.94, + "grad_norm": 1.4588640928268433, + "learning_rate": 1.0177171479040737e-07, + "loss": 0.5182, + "step": 7318 + }, + { + "epoch": 0.94, + "grad_norm": 1.3852373361587524, + "learning_rate": 1.0135560662605404e-07, + "loss": 0.5512, + "step": 7319 + }, + { + "epoch": 0.94, + "grad_norm": 1.2439759969711304, + "learning_rate": 1.0094034215362814e-07, + "loss": 0.639, + "step": 7320 + }, + { + "epoch": 0.94, + "grad_norm": 1.4485936164855957, + "learning_rate": 1.0052592144465189e-07, + "loss": 0.6265, + "step": 7321 + }, + { + "epoch": 0.94, + "grad_norm": 1.1735234260559082, + "learning_rate": 1.0011234457050045e-07, + "loss": 0.7429, + "step": 7322 + }, + { + "epoch": 0.94, + "grad_norm": 1.1867866516113281, + "learning_rate": 9.969961160240294e-08, + "loss": 0.563, + "step": 7323 + }, + { + "epoch": 0.94, + "grad_norm": 1.3140125274658203, + "learning_rate": 9.928772261144526e-08, + "loss": 0.5963, + "step": 7324 + }, + { + "epoch": 0.94, + "grad_norm": 1.2439604997634888, + "learning_rate": 9.887667766856678e-08, + "loss": 0.6702, + "step": 7325 + }, + { + "epoch": 0.94, + "grad_norm": 1.1434909105300903, + "learning_rate": 9.846647684456146e-08, + "loss": 0.5945, + "step": 7326 + }, + { + "epoch": 0.94, + "grad_norm": 1.1893295049667358, + "learning_rate": 9.805712021007774e-08, + "loss": 0.5411, + "step": 7327 + }, + { + "epoch": 0.94, + "grad_norm": 1.2865269184112549, + "learning_rate": 9.764860783561981e-08, + "loss": 0.676, + "step": 7328 + }, + { + "epoch": 0.94, + "grad_norm": 1.5296525955200195, + "learning_rate": 9.724093979154526e-08, + "loss": 0.5856, + "step": 7329 + }, + { + "epoch": 0.94, + "grad_norm": 1.277660608291626, + "learning_rate": 9.683411614806626e-08, + "loss": 0.6107, + "step": 7330 + }, + { + "epoch": 0.94, + "grad_norm": 1.141019344329834, + "learning_rate": 9.642813697525011e-08, + "loss": 0.5483, + "step": 7331 + }, + { + "epoch": 0.94, + "grad_norm": 1.5347297191619873, + "learning_rate": 9.602300234301865e-08, + "loss": 0.6413, + "step": 7332 + }, + { + "epoch": 0.94, + "grad_norm": 1.3887579441070557, + "learning_rate": 9.561871232114827e-08, + "loss": 0.5488, + "step": 7333 + }, + { + "epoch": 0.94, + "grad_norm": 1.2093347311019897, + "learning_rate": 9.521526697926887e-08, + "loss": 0.6469, + "step": 7334 + }, + { + "epoch": 0.94, + "grad_norm": 1.4027270078659058, + "learning_rate": 9.481266638686592e-08, + "loss": 0.5732, + "step": 7335 + }, + { + "epoch": 0.94, + "grad_norm": 1.4226722717285156, + "learning_rate": 9.441091061328067e-08, + "loss": 0.5628, + "step": 7336 + }, + { + "epoch": 0.94, + "grad_norm": 1.2715821266174316, + "learning_rate": 9.400999972770553e-08, + "loss": 0.5455, + "step": 7337 + }, + { + "epoch": 0.94, + "grad_norm": 1.5981942415237427, + "learning_rate": 9.360993379919026e-08, + "loss": 0.5994, + "step": 7338 + }, + { + "epoch": 0.94, + "grad_norm": 1.1296770572662354, + "learning_rate": 9.321071289663753e-08, + "loss": 0.6579, + "step": 7339 + }, + { + "epoch": 0.94, + "grad_norm": 1.32581627368927, + "learning_rate": 9.281233708880566e-08, + "loss": 0.557, + "step": 7340 + }, + { + "epoch": 0.94, + "grad_norm": 1.3479461669921875, + "learning_rate": 9.241480644430644e-08, + "loss": 0.7366, + "step": 7341 + }, + { + "epoch": 0.94, + "grad_norm": 1.926008701324463, + "learning_rate": 9.201812103160624e-08, + "loss": 0.6514, + "step": 7342 + }, + { + "epoch": 0.94, + "grad_norm": 1.4221993684768677, + "learning_rate": 9.162228091902593e-08, + "loss": 0.5706, + "step": 7343 + }, + { + "epoch": 0.94, + "grad_norm": 1.6545454263687134, + "learning_rate": 9.122728617474153e-08, + "loss": 0.6052, + "step": 7344 + }, + { + "epoch": 0.94, + "grad_norm": 1.3921095132827759, + "learning_rate": 9.083313686678252e-08, + "loss": 0.5529, + "step": 7345 + }, + { + "epoch": 0.94, + "grad_norm": 1.1098374128341675, + "learning_rate": 9.043983306303295e-08, + "loss": 0.7025, + "step": 7346 + }, + { + "epoch": 0.94, + "grad_norm": 1.4808984994888306, + "learning_rate": 9.004737483123194e-08, + "loss": 0.6095, + "step": 7347 + }, + { + "epoch": 0.94, + "grad_norm": 1.2280536890029907, + "learning_rate": 8.96557622389721e-08, + "loss": 0.6116, + "step": 7348 + }, + { + "epoch": 0.94, + "grad_norm": 1.2250851392745972, + "learning_rate": 8.926499535370058e-08, + "loss": 0.5939, + "step": 7349 + }, + { + "epoch": 0.94, + "grad_norm": 1.9256665706634521, + "learning_rate": 8.887507424271913e-08, + "loss": 0.5694, + "step": 7350 + }, + { + "epoch": 0.94, + "grad_norm": 1.2938241958618164, + "learning_rate": 8.8485998973184e-08, + "loss": 0.5579, + "step": 7351 + }, + { + "epoch": 0.94, + "grad_norm": 1.3634623289108276, + "learning_rate": 8.809776961210493e-08, + "loss": 0.5262, + "step": 7352 + }, + { + "epoch": 0.94, + "grad_norm": 1.161146879196167, + "learning_rate": 8.771038622634731e-08, + "loss": 0.5316, + "step": 7353 + }, + { + "epoch": 0.94, + "grad_norm": 1.3339877128601074, + "learning_rate": 8.732384888262945e-08, + "loss": 0.5637, + "step": 7354 + }, + { + "epoch": 0.94, + "grad_norm": 1.254198670387268, + "learning_rate": 8.693815764752477e-08, + "loss": 0.6377, + "step": 7355 + }, + { + "epoch": 0.94, + "grad_norm": 1.2894302606582642, + "learning_rate": 8.65533125874607e-08, + "loss": 0.5951, + "step": 7356 + }, + { + "epoch": 0.94, + "grad_norm": 1.4392236471176147, + "learning_rate": 8.616931376871917e-08, + "loss": 0.6323, + "step": 7357 + }, + { + "epoch": 0.94, + "grad_norm": 1.222009539604187, + "learning_rate": 8.578616125743567e-08, + "loss": 0.6005, + "step": 7358 + }, + { + "epoch": 0.94, + "grad_norm": 1.019447922706604, + "learning_rate": 8.540385511960126e-08, + "loss": 0.5459, + "step": 7359 + }, + { + "epoch": 0.94, + "grad_norm": 1.4560258388519287, + "learning_rate": 8.502239542105994e-08, + "loss": 0.6215, + "step": 7360 + }, + { + "epoch": 0.94, + "grad_norm": 1.296736240386963, + "learning_rate": 8.464178222750974e-08, + "loss": 0.5495, + "step": 7361 + }, + { + "epoch": 0.94, + "grad_norm": 1.5001124143600464, + "learning_rate": 8.426201560450431e-08, + "loss": 0.6199, + "step": 7362 + }, + { + "epoch": 0.94, + "grad_norm": 1.5553319454193115, + "learning_rate": 8.388309561745134e-08, + "loss": 0.5854, + "step": 7363 + }, + { + "epoch": 0.94, + "grad_norm": 1.3460683822631836, + "learning_rate": 8.350502233161084e-08, + "loss": 0.5443, + "step": 7364 + }, + { + "epoch": 0.94, + "grad_norm": 1.374433994293213, + "learning_rate": 8.312779581209852e-08, + "loss": 0.6094, + "step": 7365 + }, + { + "epoch": 0.94, + "grad_norm": 1.3948500156402588, + "learning_rate": 8.27514161238846e-08, + "loss": 0.5573, + "step": 7366 + }, + { + "epoch": 0.94, + "grad_norm": 1.369402289390564, + "learning_rate": 8.237588333179225e-08, + "loss": 0.4974, + "step": 7367 + }, + { + "epoch": 0.94, + "grad_norm": 1.2202465534210205, + "learning_rate": 8.200119750049918e-08, + "loss": 0.5171, + "step": 7368 + }, + { + "epoch": 0.94, + "grad_norm": 1.5176337957382202, + "learning_rate": 8.162735869453764e-08, + "loss": 0.6837, + "step": 7369 + }, + { + "epoch": 0.94, + "grad_norm": 2.0145180225372314, + "learning_rate": 8.125436697829503e-08, + "loss": 0.5776, + "step": 7370 + }, + { + "epoch": 0.94, + "grad_norm": 1.281848669052124, + "learning_rate": 8.08822224160094e-08, + "loss": 0.6065, + "step": 7371 + }, + { + "epoch": 0.94, + "grad_norm": 1.6321043968200684, + "learning_rate": 8.051092507177672e-08, + "loss": 0.6207, + "step": 7372 + }, + { + "epoch": 0.94, + "grad_norm": 1.5435349941253662, + "learning_rate": 8.014047500954524e-08, + "loss": 0.6096, + "step": 7373 + }, + { + "epoch": 0.94, + "grad_norm": 1.468544840812683, + "learning_rate": 7.97708722931162e-08, + "loss": 0.6066, + "step": 7374 + }, + { + "epoch": 0.94, + "grad_norm": 2.739684581756592, + "learning_rate": 7.940211698614753e-08, + "loss": 0.5939, + "step": 7375 + }, + { + "epoch": 0.94, + "grad_norm": 1.334362506866455, + "learning_rate": 7.903420915214899e-08, + "loss": 0.4706, + "step": 7376 + }, + { + "epoch": 0.95, + "grad_norm": 1.2078897953033447, + "learning_rate": 7.866714885448601e-08, + "loss": 0.5775, + "step": 7377 + }, + { + "epoch": 0.95, + "grad_norm": 1.5452244281768799, + "learning_rate": 7.830093615637635e-08, + "loss": 0.6114, + "step": 7378 + }, + { + "epoch": 0.95, + "grad_norm": 1.3688369989395142, + "learning_rate": 7.793557112089345e-08, + "loss": 0.5782, + "step": 7379 + }, + { + "epoch": 0.95, + "grad_norm": 1.3603882789611816, + "learning_rate": 7.757105381096363e-08, + "loss": 0.6609, + "step": 7380 + }, + { + "epoch": 0.95, + "grad_norm": 1.3486891984939575, + "learning_rate": 7.72073842893678e-08, + "loss": 0.5362, + "step": 7381 + }, + { + "epoch": 0.95, + "grad_norm": 1.3633257150650024, + "learning_rate": 7.684456261874085e-08, + "loss": 0.637, + "step": 7382 + }, + { + "epoch": 0.95, + "grad_norm": 1.2990694046020508, + "learning_rate": 7.648258886157056e-08, + "loss": 0.5077, + "step": 7383 + }, + { + "epoch": 0.95, + "grad_norm": 1.217881202697754, + "learning_rate": 7.612146308019985e-08, + "loss": 0.5874, + "step": 7384 + }, + { + "epoch": 0.95, + "grad_norm": 1.1617398262023926, + "learning_rate": 7.57611853368262e-08, + "loss": 0.5252, + "step": 7385 + }, + { + "epoch": 0.95, + "grad_norm": 1.2338204383850098, + "learning_rate": 7.540175569349883e-08, + "loss": 0.5152, + "step": 7386 + }, + { + "epoch": 0.95, + "grad_norm": 1.3661803007125854, + "learning_rate": 7.504317421212326e-08, + "loss": 0.5535, + "step": 7387 + }, + { + "epoch": 0.95, + "grad_norm": 1.4716441631317139, + "learning_rate": 7.468544095445784e-08, + "loss": 0.6154, + "step": 7388 + }, + { + "epoch": 0.95, + "grad_norm": 1.3351925611495972, + "learning_rate": 7.432855598211386e-08, + "loss": 0.6322, + "step": 7389 + }, + { + "epoch": 0.95, + "grad_norm": 1.1938472986221313, + "learning_rate": 7.397251935655825e-08, + "loss": 0.5131, + "step": 7390 + }, + { + "epoch": 0.95, + "grad_norm": 1.2754592895507812, + "learning_rate": 7.36173311391114e-08, + "loss": 0.5288, + "step": 7391 + }, + { + "epoch": 0.95, + "grad_norm": 1.3025398254394531, + "learning_rate": 7.326299139094662e-08, + "loss": 0.6064, + "step": 7392 + }, + { + "epoch": 0.95, + "grad_norm": 1.4515681266784668, + "learning_rate": 7.290950017309173e-08, + "loss": 0.5584, + "step": 7393 + }, + { + "epoch": 0.95, + "grad_norm": 1.6512247323989868, + "learning_rate": 7.255685754642805e-08, + "loss": 0.5702, + "step": 7394 + }, + { + "epoch": 0.95, + "grad_norm": 1.3409143686294556, + "learning_rate": 7.220506357169255e-08, + "loss": 0.5642, + "step": 7395 + }, + { + "epoch": 0.95, + "grad_norm": 1.2338666915893555, + "learning_rate": 7.185411830947342e-08, + "loss": 0.5008, + "step": 7396 + }, + { + "epoch": 0.95, + "grad_norm": 1.2696747779846191, + "learning_rate": 7.150402182021399e-08, + "loss": 0.5333, + "step": 7397 + }, + { + "epoch": 0.95, + "grad_norm": 1.2724928855895996, + "learning_rate": 7.115477416421101e-08, + "loss": 0.6582, + "step": 7398 + }, + { + "epoch": 0.95, + "grad_norm": 1.1926487684249878, + "learning_rate": 7.080637540161583e-08, + "loss": 0.6197, + "step": 7399 + }, + { + "epoch": 0.95, + "grad_norm": 1.4144186973571777, + "learning_rate": 7.045882559243322e-08, + "loss": 0.6195, + "step": 7400 + }, + { + "epoch": 0.95, + "grad_norm": 1.3399546146392822, + "learning_rate": 7.011212479652141e-08, + "loss": 0.5682, + "step": 7401 + }, + { + "epoch": 0.95, + "grad_norm": 1.146112084388733, + "learning_rate": 6.976627307359208e-08, + "loss": 0.5884, + "step": 7402 + }, + { + "epoch": 0.95, + "grad_norm": 1.5165796279907227, + "learning_rate": 6.94212704832109e-08, + "loss": 0.6208, + "step": 7403 + }, + { + "epoch": 0.95, + "grad_norm": 1.4220664501190186, + "learning_rate": 6.90771170847987e-08, + "loss": 0.6322, + "step": 7404 + }, + { + "epoch": 0.95, + "grad_norm": 1.4657684564590454, + "learning_rate": 6.873381293762859e-08, + "loss": 0.5636, + "step": 7405 + }, + { + "epoch": 0.95, + "grad_norm": 1.468279480934143, + "learning_rate": 6.839135810082664e-08, + "loss": 0.5901, + "step": 7406 + }, + { + "epoch": 0.95, + "grad_norm": 2.511237144470215, + "learning_rate": 6.804975263337565e-08, + "loss": 0.5578, + "step": 7407 + }, + { + "epoch": 0.95, + "grad_norm": 1.1953728199005127, + "learning_rate": 6.770899659410856e-08, + "loss": 0.5636, + "step": 7408 + }, + { + "epoch": 0.95, + "grad_norm": 1.134719967842102, + "learning_rate": 6.736909004171399e-08, + "loss": 0.6885, + "step": 7409 + }, + { + "epoch": 0.95, + "grad_norm": 1.5010466575622559, + "learning_rate": 6.703003303473454e-08, + "loss": 0.5951, + "step": 7410 + }, + { + "epoch": 0.95, + "grad_norm": 1.215299367904663, + "learning_rate": 6.669182563156574e-08, + "loss": 0.5923, + "step": 7411 + }, + { + "epoch": 0.95, + "grad_norm": 1.380470633506775, + "learning_rate": 6.635446789045651e-08, + "loss": 0.6177, + "step": 7412 + }, + { + "epoch": 0.95, + "grad_norm": 1.3253923654556274, + "learning_rate": 6.601795986951044e-08, + "loss": 0.6369, + "step": 7413 + }, + { + "epoch": 0.95, + "grad_norm": 1.4012606143951416, + "learning_rate": 6.568230162668332e-08, + "loss": 0.6044, + "step": 7414 + }, + { + "epoch": 0.95, + "grad_norm": 1.2084659337997437, + "learning_rate": 6.534749321978617e-08, + "loss": 0.7131, + "step": 7415 + }, + { + "epoch": 0.95, + "grad_norm": 1.2429335117340088, + "learning_rate": 6.501353470648286e-08, + "loss": 0.5506, + "step": 7416 + }, + { + "epoch": 0.95, + "grad_norm": 1.1838606595993042, + "learning_rate": 6.46804261442907e-08, + "loss": 0.581, + "step": 7417 + }, + { + "epoch": 0.95, + "grad_norm": 1.5430114269256592, + "learning_rate": 6.43481675905816e-08, + "loss": 0.6428, + "step": 7418 + }, + { + "epoch": 0.95, + "grad_norm": 1.4366458654403687, + "learning_rate": 6.40167591025792e-08, + "loss": 0.5344, + "step": 7419 + }, + { + "epoch": 0.95, + "grad_norm": 1.7072436809539795, + "learning_rate": 6.368620073736176e-08, + "loss": 0.6056, + "step": 7420 + }, + { + "epoch": 0.95, + "grad_norm": 1.645785927772522, + "learning_rate": 6.335649255186315e-08, + "loss": 0.6395, + "step": 7421 + }, + { + "epoch": 0.95, + "grad_norm": 1.3543944358825684, + "learning_rate": 6.302763460286687e-08, + "loss": 0.5263, + "step": 7422 + }, + { + "epoch": 0.95, + "grad_norm": 2.483001232147217, + "learning_rate": 6.269962694701315e-08, + "loss": 0.5472, + "step": 7423 + }, + { + "epoch": 0.95, + "grad_norm": 1.2230137586593628, + "learning_rate": 6.237246964079458e-08, + "loss": 0.6058, + "step": 7424 + }, + { + "epoch": 0.95, + "grad_norm": 1.5250579118728638, + "learning_rate": 6.204616274055663e-08, + "loss": 0.6588, + "step": 7425 + }, + { + "epoch": 0.95, + "grad_norm": 1.269244909286499, + "learning_rate": 6.172070630249993e-08, + "loss": 0.5417, + "step": 7426 + }, + { + "epoch": 0.95, + "grad_norm": 1.3151706457138062, + "learning_rate": 6.139610038267685e-08, + "loss": 0.548, + "step": 7427 + }, + { + "epoch": 0.95, + "grad_norm": 1.407835602760315, + "learning_rate": 6.107234503699488e-08, + "loss": 0.6501, + "step": 7428 + }, + { + "epoch": 0.95, + "grad_norm": 1.3132225275039673, + "learning_rate": 6.074944032121388e-08, + "loss": 0.4961, + "step": 7429 + }, + { + "epoch": 0.95, + "grad_norm": 1.4748717546463013, + "learning_rate": 6.042738629094769e-08, + "loss": 0.5878, + "step": 7430 + }, + { + "epoch": 0.95, + "grad_norm": 2.3013241291046143, + "learning_rate": 6.010618300166415e-08, + "loss": 0.5828, + "step": 7431 + }, + { + "epoch": 0.95, + "grad_norm": 1.3468983173370361, + "learning_rate": 5.978583050868292e-08, + "loss": 0.6095, + "step": 7432 + }, + { + "epoch": 0.95, + "grad_norm": 2.144441604614258, + "learning_rate": 5.946632886717929e-08, + "loss": 0.5951, + "step": 7433 + }, + { + "epoch": 0.95, + "grad_norm": 1.5819791555404663, + "learning_rate": 5.914767813217981e-08, + "loss": 0.6179, + "step": 7434 + }, + { + "epoch": 0.95, + "grad_norm": 1.079916000366211, + "learning_rate": 5.882987835856668e-08, + "loss": 0.6712, + "step": 7435 + }, + { + "epoch": 0.95, + "grad_norm": 1.3036378622055054, + "learning_rate": 5.851292960107391e-08, + "loss": 0.5571, + "step": 7436 + }, + { + "epoch": 0.95, + "grad_norm": 1.3435218334197998, + "learning_rate": 5.8196831914289485e-08, + "loss": 0.6113, + "step": 7437 + }, + { + "epoch": 0.95, + "grad_norm": 1.429049015045166, + "learning_rate": 5.788158535265431e-08, + "loss": 0.5887, + "step": 7438 + }, + { + "epoch": 0.95, + "grad_norm": 1.3666672706604004, + "learning_rate": 5.75671899704644e-08, + "loss": 0.6005, + "step": 7439 + }, + { + "epoch": 0.95, + "grad_norm": 1.053040623664856, + "learning_rate": 5.7253645821867546e-08, + "loss": 0.4651, + "step": 7440 + }, + { + "epoch": 0.95, + "grad_norm": 1.2168482542037964, + "learning_rate": 5.694095296086444e-08, + "loss": 0.5997, + "step": 7441 + }, + { + "epoch": 0.95, + "grad_norm": 1.2379217147827148, + "learning_rate": 5.662911144131145e-08, + "loss": 0.6519, + "step": 7442 + }, + { + "epoch": 0.95, + "grad_norm": 1.175512671470642, + "learning_rate": 5.631812131691561e-08, + "loss": 0.5748, + "step": 7443 + }, + { + "epoch": 0.95, + "grad_norm": 1.4115569591522217, + "learning_rate": 5.600798264123963e-08, + "loss": 0.5639, + "step": 7444 + }, + { + "epoch": 0.95, + "grad_norm": 1.4109700918197632, + "learning_rate": 5.569869546769857e-08, + "loss": 0.58, + "step": 7445 + }, + { + "epoch": 0.95, + "grad_norm": 1.6279006004333496, + "learning_rate": 5.5390259849559813e-08, + "loss": 0.5886, + "step": 7446 + }, + { + "epoch": 0.95, + "grad_norm": 1.9307910203933716, + "learning_rate": 5.508267583994642e-08, + "loss": 0.5608, + "step": 7447 + }, + { + "epoch": 0.95, + "grad_norm": 1.3651392459869385, + "learning_rate": 5.4775943491832684e-08, + "loss": 0.6131, + "step": 7448 + }, + { + "epoch": 0.95, + "grad_norm": 1.3257839679718018, + "learning_rate": 5.4470062858047457e-08, + "loss": 0.6027, + "step": 7449 + }, + { + "epoch": 0.95, + "grad_norm": 1.4429939985275269, + "learning_rate": 5.4165033991271934e-08, + "loss": 0.5893, + "step": 7450 + }, + { + "epoch": 0.95, + "grad_norm": 1.618682622909546, + "learning_rate": 5.3860856944041303e-08, + "loss": 0.5665, + "step": 7451 + }, + { + "epoch": 0.95, + "grad_norm": 1.1508351564407349, + "learning_rate": 5.3557531768744784e-08, + "loss": 0.6654, + "step": 7452 + }, + { + "epoch": 0.95, + "grad_norm": 1.1664789915084839, + "learning_rate": 5.32550585176228e-08, + "loss": 0.6164, + "step": 7453 + }, + { + "epoch": 0.95, + "grad_norm": 1.3211183547973633, + "learning_rate": 5.29534372427698e-08, + "loss": 0.61, + "step": 7454 + }, + { + "epoch": 0.96, + "grad_norm": 1.6111810207366943, + "learning_rate": 5.2652667996135884e-08, + "loss": 0.602, + "step": 7455 + }, + { + "epoch": 0.96, + "grad_norm": 5.7178215980529785, + "learning_rate": 5.2352750829521294e-08, + "loss": 0.5855, + "step": 7456 + }, + { + "epoch": 0.96, + "grad_norm": 1.387904167175293, + "learning_rate": 5.205368579458025e-08, + "loss": 0.6116, + "step": 7457 + }, + { + "epoch": 0.96, + "grad_norm": 6.927826881408691, + "learning_rate": 5.175547294282102e-08, + "loss": 0.629, + "step": 7458 + }, + { + "epoch": 0.96, + "grad_norm": 1.1902652978897095, + "learning_rate": 5.145811232560527e-08, + "loss": 0.5822, + "step": 7459 + }, + { + "epoch": 0.96, + "grad_norm": 1.764687180519104, + "learning_rate": 5.116160399414649e-08, + "loss": 0.6164, + "step": 7460 + }, + { + "epoch": 0.96, + "grad_norm": 1.2136625051498413, + "learning_rate": 5.0865947999512166e-08, + "loss": 0.5099, + "step": 7461 + }, + { + "epoch": 0.96, + "grad_norm": 1.5262752771377563, + "learning_rate": 5.057114439262378e-08, + "loss": 0.5656, + "step": 7462 + }, + { + "epoch": 0.96, + "grad_norm": 1.331688642501831, + "learning_rate": 5.0277193224255175e-08, + "loss": 0.5583, + "step": 7463 + }, + { + "epoch": 0.96, + "grad_norm": 1.4355744123458862, + "learning_rate": 4.9984094545033054e-08, + "loss": 0.5641, + "step": 7464 + }, + { + "epoch": 0.96, + "grad_norm": 1.4766184091567993, + "learning_rate": 4.969184840543706e-08, + "loss": 0.5645, + "step": 7465 + }, + { + "epoch": 0.96, + "grad_norm": 1.319823980331421, + "learning_rate": 4.940045485580247e-08, + "loss": 0.5336, + "step": 7466 + }, + { + "epoch": 0.96, + "grad_norm": 1.2531654834747314, + "learning_rate": 4.9109913946314145e-08, + "loss": 0.6307, + "step": 7467 + }, + { + "epoch": 0.96, + "grad_norm": 1.4141623973846436, + "learning_rate": 4.882022572701261e-08, + "loss": 0.5306, + "step": 7468 + }, + { + "epoch": 0.96, + "grad_norm": 1.1841273307800293, + "learning_rate": 4.8531390247790745e-08, + "loss": 0.5981, + "step": 7469 + }, + { + "epoch": 0.96, + "grad_norm": 1.325745940208435, + "learning_rate": 4.824340755839485e-08, + "loss": 0.5935, + "step": 7470 + }, + { + "epoch": 0.96, + "grad_norm": 1.4514849185943604, + "learning_rate": 4.795627770842359e-08, + "loss": 0.6294, + "step": 7471 + }, + { + "epoch": 0.96, + "grad_norm": 1.2255972623825073, + "learning_rate": 4.767000074732908e-08, + "loss": 0.5396, + "step": 7472 + }, + { + "epoch": 0.96, + "grad_norm": 1.6118378639221191, + "learning_rate": 4.738457672441799e-08, + "loss": 0.6553, + "step": 7473 + }, + { + "epoch": 0.96, + "grad_norm": 1.5968915224075317, + "learning_rate": 4.710000568884709e-08, + "loss": 0.6712, + "step": 7474 + }, + { + "epoch": 0.96, + "grad_norm": 1.486005425453186, + "learning_rate": 4.681628768962887e-08, + "loss": 0.6427, + "step": 7475 + }, + { + "epoch": 0.96, + "grad_norm": 1.2320314645767212, + "learning_rate": 4.653342277562811e-08, + "loss": 0.7184, + "step": 7476 + }, + { + "epoch": 0.96, + "grad_norm": 1.1806683540344238, + "learning_rate": 4.625141099556252e-08, + "loss": 0.7453, + "step": 7477 + }, + { + "epoch": 0.96, + "grad_norm": 1.3961061239242554, + "learning_rate": 4.597025239800212e-08, + "loss": 0.5866, + "step": 7478 + }, + { + "epoch": 0.96, + "grad_norm": 1.8109081983566284, + "learning_rate": 4.5689947031371527e-08, + "loss": 0.592, + "step": 7479 + }, + { + "epoch": 0.96, + "grad_norm": 1.4340118169784546, + "learning_rate": 4.541049494394767e-08, + "loss": 0.5356, + "step": 7480 + }, + { + "epoch": 0.96, + "grad_norm": 1.403652548789978, + "learning_rate": 4.513189618385983e-08, + "loss": 0.6765, + "step": 7481 + }, + { + "epoch": 0.96, + "grad_norm": 1.5432109832763672, + "learning_rate": 4.4854150799091836e-08, + "loss": 0.4902, + "step": 7482 + }, + { + "epoch": 0.96, + "grad_norm": 1.5097144842147827, + "learning_rate": 4.457725883747932e-08, + "loss": 0.6542, + "step": 7483 + }, + { + "epoch": 0.96, + "grad_norm": 1.4052143096923828, + "learning_rate": 4.4301220346710806e-08, + "loss": 0.5848, + "step": 7484 + }, + { + "epoch": 0.96, + "grad_norm": 1.4552314281463623, + "learning_rate": 4.402603537432937e-08, + "loss": 0.5903, + "step": 7485 + }, + { + "epoch": 0.96, + "grad_norm": 1.6251884698867798, + "learning_rate": 4.3751703967728765e-08, + "loss": 0.5735, + "step": 7486 + }, + { + "epoch": 0.96, + "grad_norm": 1.1430637836456299, + "learning_rate": 4.347822617415787e-08, + "loss": 0.718, + "step": 7487 + }, + { + "epoch": 0.96, + "grad_norm": 1.0503935813903809, + "learning_rate": 4.320560204071733e-08, + "loss": 0.6921, + "step": 7488 + }, + { + "epoch": 0.96, + "grad_norm": 1.4668529033660889, + "learning_rate": 4.293383161436182e-08, + "loss": 0.6366, + "step": 7489 + }, + { + "epoch": 0.96, + "grad_norm": 1.753491759300232, + "learning_rate": 4.2662914941897225e-08, + "loss": 0.5208, + "step": 7490 + }, + { + "epoch": 0.96, + "grad_norm": 1.3943778276443481, + "learning_rate": 4.239285206998345e-08, + "loss": 0.5593, + "step": 7491 + }, + { + "epoch": 0.96, + "grad_norm": 1.3868759870529175, + "learning_rate": 4.212364304513439e-08, + "loss": 0.6141, + "step": 7492 + }, + { + "epoch": 0.96, + "grad_norm": 1.3776869773864746, + "learning_rate": 4.185528791371518e-08, + "loss": 0.5684, + "step": 7493 + }, + { + "epoch": 0.96, + "grad_norm": 1.3493459224700928, + "learning_rate": 4.158778672194441e-08, + "loss": 0.5887, + "step": 7494 + }, + { + "epoch": 0.96, + "grad_norm": 1.200941801071167, + "learning_rate": 4.1321139515894116e-08, + "loss": 0.5237, + "step": 7495 + }, + { + "epoch": 0.96, + "grad_norm": 1.162359595298767, + "learning_rate": 4.105534634148867e-08, + "loss": 0.5887, + "step": 7496 + }, + { + "epoch": 0.96, + "grad_norm": 1.3749542236328125, + "learning_rate": 4.0790407244505335e-08, + "loss": 0.5458, + "step": 7497 + }, + { + "epoch": 0.96, + "grad_norm": 1.2350449562072754, + "learning_rate": 4.052632227057429e-08, + "loss": 0.5914, + "step": 7498 + }, + { + "epoch": 0.96, + "grad_norm": 1.3565806150436401, + "learning_rate": 4.02630914651797e-08, + "loss": 0.5873, + "step": 7499 + }, + { + "epoch": 0.96, + "grad_norm": 1.3490086793899536, + "learning_rate": 4.0000714873656976e-08, + "loss": 0.588, + "step": 7500 + }, + { + "epoch": 0.96, + "grad_norm": 1.4163496494293213, + "learning_rate": 3.973919254119551e-08, + "loss": 0.6271, + "step": 7501 + }, + { + "epoch": 0.96, + "grad_norm": 1.4125710725784302, + "learning_rate": 3.947852451283707e-08, + "loss": 0.6185, + "step": 7502 + }, + { + "epoch": 0.96, + "grad_norm": 1.3267148733139038, + "learning_rate": 3.921871083347628e-08, + "loss": 0.6035, + "step": 7503 + }, + { + "epoch": 0.96, + "grad_norm": 1.564961314201355, + "learning_rate": 3.8959751547861244e-08, + "loss": 0.6344, + "step": 7504 + }, + { + "epoch": 0.96, + "grad_norm": 1.7320315837860107, + "learning_rate": 3.870164670059129e-08, + "loss": 0.6125, + "step": 7505 + }, + { + "epoch": 0.96, + "grad_norm": 1.794987440109253, + "learning_rate": 3.844439633612085e-08, + "loss": 0.5969, + "step": 7506 + }, + { + "epoch": 0.96, + "grad_norm": 1.538055181503296, + "learning_rate": 3.818800049875615e-08, + "loss": 0.582, + "step": 7507 + }, + { + "epoch": 0.96, + "grad_norm": 1.460828185081482, + "learning_rate": 3.793245923265576e-08, + "loss": 0.6452, + "step": 7508 + }, + { + "epoch": 0.96, + "grad_norm": 1.2221049070358276, + "learning_rate": 3.767777258183114e-08, + "loss": 0.5363, + "step": 7509 + }, + { + "epoch": 0.96, + "grad_norm": 1.714708685874939, + "learning_rate": 3.74239405901472e-08, + "loss": 0.5413, + "step": 7510 + }, + { + "epoch": 0.96, + "grad_norm": 1.3533588647842407, + "learning_rate": 3.717096330132175e-08, + "loss": 0.6549, + "step": 7511 + }, + { + "epoch": 0.96, + "grad_norm": 1.2847508192062378, + "learning_rate": 3.691884075892438e-08, + "loss": 0.5459, + "step": 7512 + }, + { + "epoch": 0.96, + "grad_norm": 1.5215178728103638, + "learning_rate": 3.666757300637813e-08, + "loss": 0.6188, + "step": 7513 + }, + { + "epoch": 0.96, + "grad_norm": 1.9925087690353394, + "learning_rate": 3.6417160086959505e-08, + "loss": 0.5738, + "step": 7514 + }, + { + "epoch": 0.96, + "grad_norm": 1.384124755859375, + "learning_rate": 3.616760204379621e-08, + "loss": 0.6303, + "step": 7515 + }, + { + "epoch": 0.96, + "grad_norm": 1.3988687992095947, + "learning_rate": 3.5918898919869436e-08, + "loss": 0.5765, + "step": 7516 + }, + { + "epoch": 0.96, + "grad_norm": 1.3108985424041748, + "learning_rate": 3.5671050758013806e-08, + "loss": 0.5715, + "step": 7517 + }, + { + "epoch": 0.96, + "grad_norm": 1.2928988933563232, + "learning_rate": 3.542405760091627e-08, + "loss": 0.6436, + "step": 7518 + }, + { + "epoch": 0.96, + "grad_norm": 1.6511348485946655, + "learning_rate": 3.5177919491115595e-08, + "loss": 0.6539, + "step": 7519 + }, + { + "epoch": 0.96, + "grad_norm": 1.209773063659668, + "learning_rate": 3.493263647100453e-08, + "loss": 0.5657, + "step": 7520 + }, + { + "epoch": 0.96, + "grad_norm": 1.4642279148101807, + "learning_rate": 3.468820858282818e-08, + "loss": 0.5891, + "step": 7521 + }, + { + "epoch": 0.96, + "grad_norm": 1.4938738346099854, + "learning_rate": 3.444463586868452e-08, + "loss": 0.5184, + "step": 7522 + }, + { + "epoch": 0.96, + "grad_norm": 1.524194598197937, + "learning_rate": 3.420191837052278e-08, + "loss": 0.5887, + "step": 7523 + }, + { + "epoch": 0.96, + "grad_norm": 1.2606900930404663, + "learning_rate": 3.3960056130147855e-08, + "loss": 0.5932, + "step": 7524 + }, + { + "epoch": 0.96, + "grad_norm": 1.350730538368225, + "learning_rate": 3.37190491892142e-08, + "loss": 0.5891, + "step": 7525 + }, + { + "epoch": 0.96, + "grad_norm": 1.3839001655578613, + "learning_rate": 3.34788975892314e-08, + "loss": 0.5905, + "step": 7526 + }, + { + "epoch": 0.96, + "grad_norm": 1.6060885190963745, + "learning_rate": 3.3239601371559685e-08, + "loss": 0.5926, + "step": 7527 + }, + { + "epoch": 0.96, + "grad_norm": 1.3335615396499634, + "learning_rate": 3.3001160577413873e-08, + "loss": 0.6283, + "step": 7528 + }, + { + "epoch": 0.96, + "grad_norm": 1.4750850200653076, + "learning_rate": 3.276357524786e-08, + "loss": 0.5586, + "step": 7529 + }, + { + "epoch": 0.96, + "grad_norm": 1.4482122659683228, + "learning_rate": 3.252684542381701e-08, + "loss": 0.589, + "step": 7530 + }, + { + "epoch": 0.96, + "grad_norm": 1.4635100364685059, + "learning_rate": 3.229097114605784e-08, + "loss": 0.6155, + "step": 7531 + }, + { + "epoch": 0.96, + "grad_norm": 1.282564640045166, + "learning_rate": 3.20559524552061e-08, + "loss": 0.6131, + "step": 7532 + }, + { + "epoch": 0.97, + "grad_norm": 1.5637482404708862, + "learning_rate": 3.182178939173941e-08, + "loss": 0.6289, + "step": 7533 + }, + { + "epoch": 0.97, + "grad_norm": 1.3659104108810425, + "learning_rate": 3.1588481995987743e-08, + "loss": 0.5984, + "step": 7534 + }, + { + "epoch": 0.97, + "grad_norm": 1.998304843902588, + "learning_rate": 3.1356030308132834e-08, + "loss": 0.5942, + "step": 7535 + }, + { + "epoch": 0.97, + "grad_norm": 0.9875431656837463, + "learning_rate": 3.1124434368210444e-08, + "loss": 0.6936, + "step": 7536 + }, + { + "epoch": 0.97, + "grad_norm": 1.2541671991348267, + "learning_rate": 3.089369421610866e-08, + "loss": 0.6214, + "step": 7537 + }, + { + "epoch": 0.97, + "grad_norm": 1.374961018562317, + "learning_rate": 3.066380989156625e-08, + "loss": 0.5787, + "step": 7538 + }, + { + "epoch": 0.97, + "grad_norm": 1.4336551427841187, + "learning_rate": 3.043478143417766e-08, + "loss": 0.6287, + "step": 7539 + }, + { + "epoch": 0.97, + "grad_norm": 1.1535704135894775, + "learning_rate": 3.0206608883387444e-08, + "loss": 0.7096, + "step": 7540 + }, + { + "epoch": 0.97, + "grad_norm": 1.3666244745254517, + "learning_rate": 2.997929227849416e-08, + "loss": 0.5903, + "step": 7541 + }, + { + "epoch": 0.97, + "grad_norm": 1.4516652822494507, + "learning_rate": 2.9752831658648175e-08, + "loss": 0.6077, + "step": 7542 + }, + { + "epoch": 0.97, + "grad_norm": 1.3812227249145508, + "learning_rate": 2.952722706285327e-08, + "loss": 0.6462, + "step": 7543 + }, + { + "epoch": 0.97, + "grad_norm": 1.4420405626296997, + "learning_rate": 2.9302478529964484e-08, + "loss": 0.6442, + "step": 7544 + }, + { + "epoch": 0.97, + "grad_norm": 1.4087380170822144, + "learning_rate": 2.907858609869085e-08, + "loss": 0.5694, + "step": 7545 + }, + { + "epoch": 0.97, + "grad_norm": 1.8472365140914917, + "learning_rate": 2.8855549807592644e-08, + "loss": 0.6193, + "step": 7546 + }, + { + "epoch": 0.97, + "grad_norm": 1.2451483011245728, + "learning_rate": 2.8633369695084125e-08, + "loss": 0.5944, + "step": 7547 + }, + { + "epoch": 0.97, + "grad_norm": 1.3169342279434204, + "learning_rate": 2.84120457994308e-08, + "loss": 0.5901, + "step": 7548 + }, + { + "epoch": 0.97, + "grad_norm": 1.3715953826904297, + "learning_rate": 2.8191578158751064e-08, + "loss": 0.6093, + "step": 7549 + }, + { + "epoch": 0.97, + "grad_norm": 1.386909008026123, + "learning_rate": 2.7971966811016772e-08, + "loss": 0.5776, + "step": 7550 + }, + { + "epoch": 0.97, + "grad_norm": 1.138466715812683, + "learning_rate": 2.775321179405044e-08, + "loss": 0.59, + "step": 7551 + }, + { + "epoch": 0.97, + "grad_norm": 2.033170223236084, + "learning_rate": 2.7535313145528597e-08, + "loss": 0.5274, + "step": 7552 + }, + { + "epoch": 0.97, + "grad_norm": 1.6652302742004395, + "learning_rate": 2.7318270902980116e-08, + "loss": 0.5159, + "step": 7553 + }, + { + "epoch": 0.97, + "grad_norm": 1.2595553398132324, + "learning_rate": 2.7102085103786203e-08, + "loss": 0.5573, + "step": 7554 + }, + { + "epoch": 0.97, + "grad_norm": 1.474131464958191, + "learning_rate": 2.688675578517985e-08, + "loss": 0.6597, + "step": 7555 + }, + { + "epoch": 0.97, + "grad_norm": 1.894659399986267, + "learning_rate": 2.6672282984248066e-08, + "loss": 0.5857, + "step": 7556 + }, + { + "epoch": 0.97, + "grad_norm": 1.4702141284942627, + "learning_rate": 2.6458666737927962e-08, + "loss": 0.6389, + "step": 7557 + }, + { + "epoch": 0.97, + "grad_norm": 1.2070083618164062, + "learning_rate": 2.624590708301178e-08, + "loss": 0.6223, + "step": 7558 + }, + { + "epoch": 0.97, + "grad_norm": 1.6800892353057861, + "learning_rate": 2.6034004056143535e-08, + "loss": 0.5541, + "step": 7559 + }, + { + "epoch": 0.97, + "grad_norm": 1.4756264686584473, + "learning_rate": 2.582295769381793e-08, + "loss": 0.5985, + "step": 7560 + }, + { + "epoch": 0.97, + "grad_norm": 1.7296569347381592, + "learning_rate": 2.5612768032383662e-08, + "loss": 0.5712, + "step": 7561 + }, + { + "epoch": 0.97, + "grad_norm": 1.2623050212860107, + "learning_rate": 2.540343510804233e-08, + "loss": 0.5563, + "step": 7562 + }, + { + "epoch": 0.97, + "grad_norm": 1.2337028980255127, + "learning_rate": 2.519495895684676e-08, + "loss": 0.6952, + "step": 7563 + }, + { + "epoch": 0.97, + "grad_norm": 1.9761078357696533, + "learning_rate": 2.498733961470268e-08, + "loss": 0.6188, + "step": 7564 + }, + { + "epoch": 0.97, + "grad_norm": 1.36776864528656, + "learning_rate": 2.47805771173687e-08, + "loss": 0.5815, + "step": 7565 + }, + { + "epoch": 0.97, + "grad_norm": 1.5808404684066772, + "learning_rate": 2.4574671500455227e-08, + "loss": 0.6603, + "step": 7566 + }, + { + "epoch": 0.97, + "grad_norm": 1.1816710233688354, + "learning_rate": 2.4369622799425008e-08, + "loss": 0.6366, + "step": 7567 + }, + { + "epoch": 0.97, + "grad_norm": 1.2168693542480469, + "learning_rate": 2.416543104959368e-08, + "loss": 0.593, + "step": 7568 + }, + { + "epoch": 0.97, + "grad_norm": 1.2128366231918335, + "learning_rate": 2.3962096286129778e-08, + "loss": 0.6183, + "step": 7569 + }, + { + "epoch": 0.97, + "grad_norm": 1.4425259828567505, + "learning_rate": 2.375961854405251e-08, + "loss": 0.6046, + "step": 7570 + }, + { + "epoch": 0.97, + "grad_norm": 1.308990716934204, + "learning_rate": 2.3557997858235647e-08, + "loss": 0.5935, + "step": 7571 + }, + { + "epoch": 0.97, + "grad_norm": 1.4226394891738892, + "learning_rate": 2.3357234263403083e-08, + "loss": 0.6166, + "step": 7572 + }, + { + "epoch": 0.97, + "grad_norm": 1.415466070175171, + "learning_rate": 2.3157327794133267e-08, + "loss": 0.5839, + "step": 7573 + }, + { + "epoch": 0.97, + "grad_norm": 1.2484937906265259, + "learning_rate": 2.2958278484855323e-08, + "loss": 0.5463, + "step": 7574 + }, + { + "epoch": 0.97, + "grad_norm": 1.1190849542617798, + "learning_rate": 2.2760086369851832e-08, + "loss": 0.582, + "step": 7575 + }, + { + "epoch": 0.97, + "grad_norm": 1.3897758722305298, + "learning_rate": 2.2562751483257706e-08, + "loss": 0.5186, + "step": 7576 + }, + { + "epoch": 0.97, + "grad_norm": 1.5645219087600708, + "learning_rate": 2.2366273859059095e-08, + "loss": 0.5141, + "step": 7577 + }, + { + "epoch": 0.97, + "grad_norm": 1.2559103965759277, + "learning_rate": 2.21706535310956e-08, + "loss": 0.5594, + "step": 7578 + }, + { + "epoch": 0.97, + "grad_norm": 1.294405460357666, + "learning_rate": 2.197589053305915e-08, + "loss": 0.6049, + "step": 7579 + }, + { + "epoch": 0.97, + "grad_norm": 1.4571852684020996, + "learning_rate": 2.1781984898492926e-08, + "loss": 0.6056, + "step": 7580 + }, + { + "epoch": 0.97, + "grad_norm": 1.2196654081344604, + "learning_rate": 2.158893666079409e-08, + "loss": 0.5471, + "step": 7581 + }, + { + "epoch": 0.97, + "grad_norm": 1.3833738565444946, + "learning_rate": 2.1396745853211054e-08, + "loss": 0.5686, + "step": 7582 + }, + { + "epoch": 0.97, + "grad_norm": 1.3073983192443848, + "learning_rate": 2.1205412508844557e-08, + "loss": 0.662, + "step": 7583 + }, + { + "epoch": 0.97, + "grad_norm": 1.2904523611068726, + "learning_rate": 2.1014936660647688e-08, + "loss": 0.6103, + "step": 7584 + }, + { + "epoch": 0.97, + "grad_norm": 1.7544934749603271, + "learning_rate": 2.0825318341426982e-08, + "loss": 0.5528, + "step": 7585 + }, + { + "epoch": 0.97, + "grad_norm": 1.464415192604065, + "learning_rate": 2.063655758383909e-08, + "loss": 0.6172, + "step": 7586 + }, + { + "epoch": 0.97, + "grad_norm": 2.500410795211792, + "learning_rate": 2.0448654420395232e-08, + "loss": 0.6548, + "step": 7587 + }, + { + "epoch": 0.97, + "grad_norm": 1.2668145895004272, + "learning_rate": 2.0261608883457297e-08, + "loss": 0.5142, + "step": 7588 + }, + { + "epoch": 0.97, + "grad_norm": 1.5883921384811401, + "learning_rate": 2.007542100524007e-08, + "loss": 0.5204, + "step": 7589 + }, + { + "epoch": 0.97, + "grad_norm": 1.5936336517333984, + "learning_rate": 1.9890090817811237e-08, + "loss": 0.5901, + "step": 7590 + }, + { + "epoch": 0.97, + "grad_norm": 1.3055000305175781, + "learning_rate": 1.970561835308915e-08, + "loss": 0.6495, + "step": 7591 + }, + { + "epoch": 0.97, + "grad_norm": 1.4675624370574951, + "learning_rate": 1.952200364284673e-08, + "loss": 0.6421, + "step": 7592 + }, + { + "epoch": 0.97, + "grad_norm": 1.198490023612976, + "learning_rate": 1.933924671870646e-08, + "loss": 0.6123, + "step": 7593 + }, + { + "epoch": 0.97, + "grad_norm": 1.3107367753982544, + "learning_rate": 1.915734761214594e-08, + "loss": 0.5849, + "step": 7594 + }, + { + "epoch": 0.97, + "grad_norm": 1.3395153284072876, + "learning_rate": 1.8976306354492323e-08, + "loss": 0.6503, + "step": 7595 + }, + { + "epoch": 0.97, + "grad_norm": 1.2745283842086792, + "learning_rate": 1.8796122976927343e-08, + "loss": 0.5744, + "step": 7596 + }, + { + "epoch": 0.97, + "grad_norm": 1.3492122888565063, + "learning_rate": 1.8616797510483396e-08, + "loss": 0.6037, + "step": 7597 + }, + { + "epoch": 0.97, + "grad_norm": 1.5924208164215088, + "learning_rate": 1.8438329986045224e-08, + "loss": 0.6054, + "step": 7598 + }, + { + "epoch": 0.97, + "grad_norm": 1.8689661026000977, + "learning_rate": 1.8260720434351008e-08, + "loss": 0.6192, + "step": 7599 + }, + { + "epoch": 0.97, + "grad_norm": 1.1037673950195312, + "learning_rate": 1.8083968885990178e-08, + "loss": 0.657, + "step": 7600 + }, + { + "epoch": 0.97, + "grad_norm": 1.0584919452667236, + "learning_rate": 1.7908075371404487e-08, + "loss": 0.693, + "step": 7601 + }, + { + "epoch": 0.97, + "grad_norm": 1.1882449388504028, + "learning_rate": 1.7733039920887484e-08, + "loss": 0.5358, + "step": 7602 + }, + { + "epoch": 0.97, + "grad_norm": 1.2444685697555542, + "learning_rate": 1.755886256458672e-08, + "loss": 0.6336, + "step": 7603 + }, + { + "epoch": 0.97, + "grad_norm": 1.3037827014923096, + "learning_rate": 1.738554333249931e-08, + "loss": 0.6433, + "step": 7604 + }, + { + "epoch": 0.97, + "grad_norm": 1.2642031908035278, + "learning_rate": 1.721308225447693e-08, + "loss": 0.5269, + "step": 7605 + }, + { + "epoch": 0.97, + "grad_norm": 1.2133139371871948, + "learning_rate": 1.704147936022249e-08, + "loss": 0.6345, + "step": 7606 + }, + { + "epoch": 0.97, + "grad_norm": 1.2551854848861694, + "learning_rate": 1.687073467929068e-08, + "loss": 0.503, + "step": 7607 + }, + { + "epoch": 0.97, + "grad_norm": 1.7916251420974731, + "learning_rate": 1.670084824108853e-08, + "loss": 0.5754, + "step": 7608 + }, + { + "epoch": 0.97, + "grad_norm": 1.8876594305038452, + "learning_rate": 1.6531820074876526e-08, + "loss": 0.6057, + "step": 7609 + }, + { + "epoch": 0.97, + "grad_norm": 1.3388862609863281, + "learning_rate": 1.636365020976527e-08, + "loss": 0.5536, + "step": 7610 + }, + { + "epoch": 0.98, + "grad_norm": 1.126447081565857, + "learning_rate": 1.619633867471937e-08, + "loss": 0.5904, + "step": 7611 + }, + { + "epoch": 0.98, + "grad_norm": 1.4953137636184692, + "learning_rate": 1.6029885498554108e-08, + "loss": 0.6016, + "step": 7612 + }, + { + "epoch": 0.98, + "grad_norm": 1.5743541717529297, + "learning_rate": 1.5864290709938778e-08, + "loss": 0.6243, + "step": 7613 + }, + { + "epoch": 0.98, + "grad_norm": 1.5609896183013916, + "learning_rate": 1.5699554337392786e-08, + "loss": 0.6461, + "step": 7614 + }, + { + "epoch": 0.98, + "grad_norm": 1.3598272800445557, + "learning_rate": 1.5535676409288435e-08, + "loss": 0.6131, + "step": 7615 + }, + { + "epoch": 0.98, + "grad_norm": 1.5744637250900269, + "learning_rate": 1.5372656953851482e-08, + "loss": 0.5648, + "step": 7616 + }, + { + "epoch": 0.98, + "grad_norm": 2.973106622695923, + "learning_rate": 1.521049599915725e-08, + "loss": 0.6274, + "step": 7617 + }, + { + "epoch": 0.98, + "grad_norm": 1.3239871263504028, + "learning_rate": 1.5049193573136168e-08, + "loss": 0.6198, + "step": 7618 + }, + { + "epoch": 0.98, + "grad_norm": 1.3224722146987915, + "learning_rate": 1.4888749703568239e-08, + "loss": 0.521, + "step": 7619 + }, + { + "epoch": 0.98, + "grad_norm": 1.16157865524292, + "learning_rate": 1.4729164418087472e-08, + "loss": 0.5791, + "step": 7620 + }, + { + "epoch": 0.98, + "grad_norm": 1.2755320072174072, + "learning_rate": 1.4570437744177989e-08, + "loss": 0.5609, + "step": 7621 + }, + { + "epoch": 0.98, + "grad_norm": 1.4971299171447754, + "learning_rate": 1.441256970917848e-08, + "loss": 0.5949, + "step": 7622 + }, + { + "epoch": 0.98, + "grad_norm": 1.2407373189926147, + "learning_rate": 1.4255560340277753e-08, + "loss": 0.5842, + "step": 7623 + }, + { + "epoch": 0.98, + "grad_norm": 1.3337568044662476, + "learning_rate": 1.409940966451806e-08, + "loss": 0.5259, + "step": 7624 + }, + { + "epoch": 0.98, + "grad_norm": 1.6187931299209595, + "learning_rate": 1.3944117708792338e-08, + "loss": 0.4729, + "step": 7625 + }, + { + "epoch": 0.98, + "grad_norm": 1.2470555305480957, + "learning_rate": 1.3789684499846967e-08, + "loss": 0.5533, + "step": 7626 + }, + { + "epoch": 0.98, + "grad_norm": 1.2147272825241089, + "learning_rate": 1.3636110064280117e-08, + "loss": 0.5493, + "step": 7627 + }, + { + "epoch": 0.98, + "grad_norm": 1.2246286869049072, + "learning_rate": 1.348339442854174e-08, + "loss": 0.5352, + "step": 7628 + }, + { + "epoch": 0.98, + "grad_norm": 1.2009773254394531, + "learning_rate": 1.3331537618934132e-08, + "loss": 0.6325, + "step": 7629 + }, + { + "epoch": 0.98, + "grad_norm": 1.299921989440918, + "learning_rate": 1.318053966161137e-08, + "loss": 0.4974, + "step": 7630 + }, + { + "epoch": 0.98, + "grad_norm": 1.8579556941986084, + "learning_rate": 1.3030400582579317e-08, + "loss": 0.5998, + "step": 7631 + }, + { + "epoch": 0.98, + "grad_norm": 1.709477424621582, + "learning_rate": 1.2881120407696734e-08, + "loss": 0.6101, + "step": 7632 + }, + { + "epoch": 0.98, + "grad_norm": 1.3852404356002808, + "learning_rate": 1.2732699162674721e-08, + "loss": 0.537, + "step": 7633 + }, + { + "epoch": 0.98, + "grad_norm": 1.1303093433380127, + "learning_rate": 1.2585136873075055e-08, + "loss": 0.6092, + "step": 7634 + }, + { + "epoch": 0.98, + "grad_norm": 1.6485886573791504, + "learning_rate": 1.2438433564312958e-08, + "loss": 0.5874, + "step": 7635 + }, + { + "epoch": 0.98, + "grad_norm": 1.3467371463775635, + "learning_rate": 1.2292589261654332e-08, + "loss": 0.5349, + "step": 7636 + }, + { + "epoch": 0.98, + "grad_norm": 1.4503722190856934, + "learning_rate": 1.214760399021797e-08, + "loss": 0.6611, + "step": 7637 + }, + { + "epoch": 0.98, + "grad_norm": 1.5155328512191772, + "learning_rate": 1.200347777497557e-08, + "loss": 0.489, + "step": 7638 + }, + { + "epoch": 0.98, + "grad_norm": 1.5880452394485474, + "learning_rate": 1.1860210640748936e-08, + "loss": 0.6368, + "step": 7639 + }, + { + "epoch": 0.98, + "grad_norm": 1.2064660787582397, + "learning_rate": 1.1717802612213336e-08, + "loss": 0.5961, + "step": 7640 + }, + { + "epoch": 0.98, + "grad_norm": 1.406825304031372, + "learning_rate": 1.1576253713895258e-08, + "loss": 0.5817, + "step": 7641 + }, + { + "epoch": 0.98, + "grad_norm": 2.6012203693389893, + "learning_rate": 1.1435563970174645e-08, + "loss": 0.594, + "step": 7642 + }, + { + "epoch": 0.98, + "grad_norm": 1.1902316808700562, + "learning_rate": 1.1295733405281006e-08, + "loss": 0.6312, + "step": 7643 + }, + { + "epoch": 0.98, + "grad_norm": 1.4329363107681274, + "learning_rate": 1.1156762043298963e-08, + "loss": 0.5915, + "step": 7644 + }, + { + "epoch": 0.98, + "grad_norm": 1.167524814605713, + "learning_rate": 1.1018649908162149e-08, + "loss": 0.6717, + "step": 7645 + }, + { + "epoch": 0.98, + "grad_norm": 1.4185863733291626, + "learning_rate": 1.0881397023658202e-08, + "loss": 0.6306, + "step": 7646 + }, + { + "epoch": 0.98, + "grad_norm": 1.5840102434158325, + "learning_rate": 1.0745003413425992e-08, + "loss": 0.5838, + "step": 7647 + }, + { + "epoch": 0.98, + "grad_norm": 1.3226847648620605, + "learning_rate": 1.0609469100956171e-08, + "loss": 0.5167, + "step": 7648 + }, + { + "epoch": 0.98, + "grad_norm": 1.2246990203857422, + "learning_rate": 1.0474794109592845e-08, + "loss": 0.5422, + "step": 7649 + }, + { + "epoch": 0.98, + "grad_norm": 1.3924264907836914, + "learning_rate": 1.0340978462529682e-08, + "loss": 0.6216, + "step": 7650 + }, + { + "epoch": 0.98, + "grad_norm": 1.2300564050674438, + "learning_rate": 1.0208022182814914e-08, + "loss": 0.4941, + "step": 7651 + }, + { + "epoch": 0.98, + "grad_norm": 1.2392950057983398, + "learning_rate": 1.0075925293346888e-08, + "loss": 0.5609, + "step": 7652 + }, + { + "epoch": 0.98, + "grad_norm": 1.3816360235214233, + "learning_rate": 9.944687816876296e-09, + "loss": 0.592, + "step": 7653 + }, + { + "epoch": 0.98, + "grad_norm": 1.271823763847351, + "learning_rate": 9.814309776006725e-09, + "loss": 0.5909, + "step": 7654 + }, + { + "epoch": 0.98, + "grad_norm": 1.333675503730774, + "learning_rate": 9.684791193193543e-09, + "loss": 0.6167, + "step": 7655 + }, + { + "epoch": 0.98, + "grad_norm": 1.711577296257019, + "learning_rate": 9.556132090742797e-09, + "loss": 0.5504, + "step": 7656 + }, + { + "epoch": 0.98, + "grad_norm": 1.2807025909423828, + "learning_rate": 9.42833249081343e-09, + "loss": 0.6422, + "step": 7657 + }, + { + "epoch": 0.98, + "grad_norm": 2.3574330806732178, + "learning_rate": 9.301392415417276e-09, + "loss": 0.5771, + "step": 7658 + }, + { + "epoch": 0.98, + "grad_norm": 1.2664991617202759, + "learning_rate": 9.175311886416294e-09, + "loss": 0.5728, + "step": 7659 + }, + { + "epoch": 0.98, + "grad_norm": 1.3596203327178955, + "learning_rate": 9.05009092552478e-09, + "loss": 0.5903, + "step": 7660 + }, + { + "epoch": 0.98, + "grad_norm": 1.3723080158233643, + "learning_rate": 8.925729554311036e-09, + "loss": 0.638, + "step": 7661 + }, + { + "epoch": 0.98, + "grad_norm": 1.689207673072815, + "learning_rate": 8.80222779419293e-09, + "loss": 0.6545, + "step": 7662 + }, + { + "epoch": 0.98, + "grad_norm": 1.2211661338806152, + "learning_rate": 8.679585666441226e-09, + "loss": 0.594, + "step": 7663 + }, + { + "epoch": 0.98, + "grad_norm": 1.2049944400787354, + "learning_rate": 8.557803192178471e-09, + "loss": 0.5486, + "step": 7664 + }, + { + "epoch": 0.98, + "grad_norm": 1.2144725322723389, + "learning_rate": 8.436880392379e-09, + "loss": 0.6328, + "step": 7665 + }, + { + "epoch": 0.98, + "grad_norm": 1.2572896480560303, + "learning_rate": 8.316817287870039e-09, + "loss": 0.6196, + "step": 7666 + }, + { + "epoch": 0.98, + "grad_norm": 1.505215048789978, + "learning_rate": 8.197613899329493e-09, + "loss": 0.599, + "step": 7667 + }, + { + "epoch": 0.98, + "grad_norm": 1.4084683656692505, + "learning_rate": 8.079270247287607e-09, + "loss": 0.5462, + "step": 7668 + }, + { + "epoch": 0.98, + "grad_norm": 1.439065933227539, + "learning_rate": 7.961786352126411e-09, + "loss": 0.5704, + "step": 7669 + }, + { + "epoch": 0.98, + "grad_norm": 1.5005910396575928, + "learning_rate": 7.845162234081382e-09, + "loss": 0.6232, + "step": 7670 + }, + { + "epoch": 0.98, + "grad_norm": 1.1550309658050537, + "learning_rate": 7.729397913237013e-09, + "loss": 0.5539, + "step": 7671 + }, + { + "epoch": 0.98, + "grad_norm": 1.173595666885376, + "learning_rate": 7.614493409532909e-09, + "loss": 0.5558, + "step": 7672 + }, + { + "epoch": 0.98, + "grad_norm": 1.4196832180023193, + "learning_rate": 7.500448742757682e-09, + "loss": 0.566, + "step": 7673 + }, + { + "epoch": 0.98, + "grad_norm": 1.2495452165603638, + "learning_rate": 7.387263932554511e-09, + "loss": 0.6306, + "step": 7674 + }, + { + "epoch": 0.98, + "grad_norm": 1.6128349304199219, + "learning_rate": 7.274938998415581e-09, + "loss": 0.5693, + "step": 7675 + }, + { + "epoch": 0.98, + "grad_norm": 2.603642702102661, + "learning_rate": 7.163473959688194e-09, + "loss": 0.5928, + "step": 7676 + }, + { + "epoch": 0.98, + "grad_norm": 1.5382452011108398, + "learning_rate": 7.052868835569215e-09, + "loss": 0.5689, + "step": 7677 + }, + { + "epoch": 0.98, + "grad_norm": 1.2661441564559937, + "learning_rate": 6.943123645107852e-09, + "loss": 0.5303, + "step": 7678 + }, + { + "epoch": 0.98, + "grad_norm": 1.5901883840560913, + "learning_rate": 6.8342384072056515e-09, + "loss": 0.5756, + "step": 7679 + }, + { + "epoch": 0.98, + "grad_norm": 1.394724726676941, + "learning_rate": 6.7262131406165e-09, + "loss": 0.6474, + "step": 7680 + }, + { + "epoch": 0.98, + "grad_norm": 1.2976278066635132, + "learning_rate": 6.6190478639444056e-09, + "loss": 0.5856, + "step": 7681 + }, + { + "epoch": 0.98, + "grad_norm": 1.3308930397033691, + "learning_rate": 6.512742595647381e-09, + "loss": 0.5754, + "step": 7682 + }, + { + "epoch": 0.98, + "grad_norm": 1.4971197843551636, + "learning_rate": 6.407297354034115e-09, + "loss": 0.6073, + "step": 7683 + }, + { + "epoch": 0.98, + "grad_norm": 1.3769445419311523, + "learning_rate": 6.302712157265634e-09, + "loss": 0.584, + "step": 7684 + }, + { + "epoch": 0.98, + "grad_norm": 1.4821816682815552, + "learning_rate": 6.198987023353642e-09, + "loss": 0.585, + "step": 7685 + }, + { + "epoch": 0.98, + "grad_norm": 1.0962272882461548, + "learning_rate": 6.096121970163849e-09, + "loss": 0.5703, + "step": 7686 + }, + { + "epoch": 0.98, + "grad_norm": 1.3945462703704834, + "learning_rate": 5.994117015412082e-09, + "loss": 0.5628, + "step": 7687 + }, + { + "epoch": 0.98, + "grad_norm": 1.610139012336731, + "learning_rate": 5.892972176666512e-09, + "loss": 0.6201, + "step": 7688 + }, + { + "epoch": 0.99, + "grad_norm": 1.2614445686340332, + "learning_rate": 5.792687471347646e-09, + "loss": 0.6168, + "step": 7689 + }, + { + "epoch": 0.99, + "grad_norm": 1.807807207107544, + "learning_rate": 5.693262916726672e-09, + "loss": 0.5201, + "step": 7690 + }, + { + "epoch": 0.99, + "grad_norm": 1.3365129232406616, + "learning_rate": 5.594698529928777e-09, + "loss": 0.5822, + "step": 7691 + }, + { + "epoch": 0.99, + "grad_norm": 1.4841192960739136, + "learning_rate": 5.496994327928717e-09, + "loss": 0.5317, + "step": 7692 + }, + { + "epoch": 0.99, + "grad_norm": 1.2918846607208252, + "learning_rate": 5.4001503275546984e-09, + "loss": 0.651, + "step": 7693 + }, + { + "epoch": 0.99, + "grad_norm": 1.43020761013031, + "learning_rate": 5.304166545485046e-09, + "loss": 0.6198, + "step": 7694 + }, + { + "epoch": 0.99, + "grad_norm": 1.3098740577697754, + "learning_rate": 5.209042998252645e-09, + "loss": 0.5555, + "step": 7695 + }, + { + "epoch": 0.99, + "grad_norm": 1.4694414138793945, + "learning_rate": 5.114779702238837e-09, + "loss": 0.6245, + "step": 7696 + }, + { + "epoch": 0.99, + "grad_norm": 1.2974458932876587, + "learning_rate": 5.021376673680078e-09, + "loss": 0.6221, + "step": 7697 + }, + { + "epoch": 0.99, + "grad_norm": 1.057411789894104, + "learning_rate": 4.928833928661836e-09, + "loss": 0.6633, + "step": 7698 + }, + { + "epoch": 0.99, + "grad_norm": 1.225907325744629, + "learning_rate": 4.837151483123581e-09, + "loss": 0.5786, + "step": 7699 + }, + { + "epoch": 0.99, + "grad_norm": 1.4693338871002197, + "learning_rate": 4.746329352856016e-09, + "loss": 0.5585, + "step": 7700 + }, + { + "epoch": 0.99, + "grad_norm": 1.21501624584198, + "learning_rate": 4.6563675534999635e-09, + "loss": 0.5666, + "step": 7701 + }, + { + "epoch": 0.99, + "grad_norm": 1.2666512727737427, + "learning_rate": 4.567266100550805e-09, + "loss": 0.5707, + "step": 7702 + }, + { + "epoch": 0.99, + "grad_norm": 1.1822082996368408, + "learning_rate": 4.4790250093545984e-09, + "loss": 0.474, + "step": 7703 + }, + { + "epoch": 0.99, + "grad_norm": 1.4030053615570068, + "learning_rate": 4.3916442951080775e-09, + "loss": 0.5502, + "step": 7704 + }, + { + "epoch": 0.99, + "grad_norm": 1.361227035522461, + "learning_rate": 4.305123972861424e-09, + "loss": 0.5675, + "step": 7705 + }, + { + "epoch": 0.99, + "grad_norm": 1.310667634010315, + "learning_rate": 4.219464057516054e-09, + "loss": 0.6493, + "step": 7706 + }, + { + "epoch": 0.99, + "grad_norm": 1.2178518772125244, + "learning_rate": 4.1346645638246084e-09, + "loss": 0.5146, + "step": 7707 + }, + { + "epoch": 0.99, + "grad_norm": 1.443044900894165, + "learning_rate": 4.0507255063926275e-09, + "loss": 0.6149, + "step": 7708 + }, + { + "epoch": 0.99, + "grad_norm": 1.3189752101898193, + "learning_rate": 3.96764689967688e-09, + "loss": 0.5632, + "step": 7709 + }, + { + "epoch": 0.99, + "grad_norm": 1.260591983795166, + "learning_rate": 3.885428757985366e-09, + "loss": 0.5505, + "step": 7710 + }, + { + "epoch": 0.99, + "grad_norm": 1.4206947088241577, + "learning_rate": 3.804071095479533e-09, + "loss": 0.614, + "step": 7711 + }, + { + "epoch": 0.99, + "grad_norm": 1.2580530643463135, + "learning_rate": 3.7235739261703983e-09, + "loss": 0.5879, + "step": 7712 + }, + { + "epoch": 0.99, + "grad_norm": 1.525988221168518, + "learning_rate": 3.643937263922981e-09, + "loss": 0.6059, + "step": 7713 + }, + { + "epoch": 0.99, + "grad_norm": 1.3162834644317627, + "learning_rate": 3.5651611224524207e-09, + "loss": 0.6687, + "step": 7714 + }, + { + "epoch": 0.99, + "grad_norm": 1.2362546920776367, + "learning_rate": 3.4872455153267536e-09, + "loss": 0.6078, + "step": 7715 + }, + { + "epoch": 0.99, + "grad_norm": 1.3531043529510498, + "learning_rate": 3.41019045596469e-09, + "loss": 0.5203, + "step": 7716 + }, + { + "epoch": 0.99, + "grad_norm": 1.5174033641815186, + "learning_rate": 3.3339959576378365e-09, + "loss": 0.5899, + "step": 7717 + }, + { + "epoch": 0.99, + "grad_norm": 1.8559000492095947, + "learning_rate": 3.258662033469584e-09, + "loss": 0.5911, + "step": 7718 + }, + { + "epoch": 0.99, + "grad_norm": 1.32603919506073, + "learning_rate": 3.1841886964339987e-09, + "loss": 0.6394, + "step": 7719 + }, + { + "epoch": 0.99, + "grad_norm": 1.5879665613174438, + "learning_rate": 3.110575959358042e-09, + "loss": 0.59, + "step": 7720 + }, + { + "epoch": 0.99, + "grad_norm": 1.3852684497833252, + "learning_rate": 3.037823834919906e-09, + "loss": 0.5394, + "step": 7721 + }, + { + "epoch": 0.99, + "grad_norm": 1.2205983400344849, + "learning_rate": 2.9659323356490134e-09, + "loss": 0.7311, + "step": 7722 + }, + { + "epoch": 0.99, + "grad_norm": 1.312951922416687, + "learning_rate": 2.8949014739282354e-09, + "loss": 0.4707, + "step": 7723 + }, + { + "epoch": 0.99, + "grad_norm": 1.5596814155578613, + "learning_rate": 2.8247312619905653e-09, + "loss": 0.606, + "step": 7724 + }, + { + "epoch": 0.99, + "grad_norm": 1.2269487380981445, + "learning_rate": 2.7554217119213354e-09, + "loss": 0.5177, + "step": 7725 + }, + { + "epoch": 0.99, + "grad_norm": 1.2579458951950073, + "learning_rate": 2.6869728356582192e-09, + "loss": 0.55, + "step": 7726 + }, + { + "epoch": 0.99, + "grad_norm": 1.6828994750976562, + "learning_rate": 2.6193846449901193e-09, + "loss": 0.6079, + "step": 7727 + }, + { + "epoch": 0.99, + "grad_norm": 1.452315330505371, + "learning_rate": 2.5526571515571696e-09, + "loss": 0.5791, + "step": 7728 + }, + { + "epoch": 0.99, + "grad_norm": 1.2305513620376587, + "learning_rate": 2.4867903668518433e-09, + "loss": 0.5592, + "step": 7729 + }, + { + "epoch": 0.99, + "grad_norm": 1.548684000968933, + "learning_rate": 2.4217843022189546e-09, + "loss": 0.4734, + "step": 7730 + }, + { + "epoch": 0.99, + "grad_norm": 1.631661295890808, + "learning_rate": 2.3576389688539924e-09, + "loss": 0.5776, + "step": 7731 + }, + { + "epoch": 0.99, + "grad_norm": 1.554509162902832, + "learning_rate": 2.294354377804786e-09, + "loss": 0.604, + "step": 7732 + }, + { + "epoch": 0.99, + "grad_norm": 1.5531501770019531, + "learning_rate": 2.2319305399715054e-09, + "loss": 0.5192, + "step": 7733 + }, + { + "epoch": 0.99, + "grad_norm": 1.3193473815917969, + "learning_rate": 2.170367466103884e-09, + "loss": 0.5963, + "step": 7734 + }, + { + "epoch": 0.99, + "grad_norm": 1.141550898551941, + "learning_rate": 2.1096651668062184e-09, + "loss": 0.5749, + "step": 7735 + }, + { + "epoch": 0.99, + "grad_norm": 1.7103443145751953, + "learning_rate": 2.0498236525329228e-09, + "loss": 0.5586, + "step": 7736 + }, + { + "epoch": 0.99, + "grad_norm": 1.4387218952178955, + "learning_rate": 1.990842933590198e-09, + "loss": 0.5993, + "step": 7737 + }, + { + "epoch": 0.99, + "grad_norm": 1.195109486579895, + "learning_rate": 1.932723020136584e-09, + "loss": 0.5556, + "step": 7738 + }, + { + "epoch": 0.99, + "grad_norm": 1.2314635515213013, + "learning_rate": 1.875463922181853e-09, + "loss": 0.5436, + "step": 7739 + }, + { + "epoch": 0.99, + "grad_norm": 1.4578646421432495, + "learning_rate": 1.8190656495875591e-09, + "loss": 0.6199, + "step": 7740 + }, + { + "epoch": 0.99, + "grad_norm": 1.4149136543273926, + "learning_rate": 1.7635282120676e-09, + "loss": 0.5495, + "step": 7741 + }, + { + "epoch": 0.99, + "grad_norm": 1.971264362335205, + "learning_rate": 1.7088516191871018e-09, + "loss": 0.6472, + "step": 7742 + }, + { + "epoch": 0.99, + "grad_norm": 1.5405484437942505, + "learning_rate": 1.655035880362421e-09, + "loss": 0.5643, + "step": 7743 + }, + { + "epoch": 0.99, + "grad_norm": 1.398697853088379, + "learning_rate": 1.602081004863365e-09, + "loss": 0.5151, + "step": 7744 + }, + { + "epoch": 0.99, + "grad_norm": 1.603548288345337, + "learning_rate": 1.5499870018093054e-09, + "loss": 0.5618, + "step": 7745 + }, + { + "epoch": 0.99, + "grad_norm": 2.4291934967041016, + "learning_rate": 1.49875388017251e-09, + "loss": 0.7038, + "step": 7746 + }, + { + "epoch": 0.99, + "grad_norm": 1.4780473709106445, + "learning_rate": 1.4483816487770309e-09, + "loss": 0.6409, + "step": 7747 + }, + { + "epoch": 0.99, + "grad_norm": 1.2052099704742432, + "learning_rate": 1.398870316298151e-09, + "loss": 0.5907, + "step": 7748 + }, + { + "epoch": 0.99, + "grad_norm": 1.4878346920013428, + "learning_rate": 1.3502198912640484e-09, + "loss": 0.649, + "step": 7749 + }, + { + "epoch": 0.99, + "grad_norm": 1.4148633480072021, + "learning_rate": 1.3024303820530216e-09, + "loss": 0.5942, + "step": 7750 + }, + { + "epoch": 0.99, + "grad_norm": 1.2521181106567383, + "learning_rate": 1.2555017968962635e-09, + "loss": 0.5044, + "step": 7751 + }, + { + "epoch": 0.99, + "grad_norm": 1.4545745849609375, + "learning_rate": 1.209434143875643e-09, + "loss": 0.5946, + "step": 7752 + }, + { + "epoch": 0.99, + "grad_norm": 1.1492974758148193, + "learning_rate": 1.1642274309259238e-09, + "loss": 0.6968, + "step": 7753 + }, + { + "epoch": 0.99, + "grad_norm": 1.2396249771118164, + "learning_rate": 1.1198816658331003e-09, + "loss": 0.4695, + "step": 7754 + }, + { + "epoch": 0.99, + "grad_norm": 1.2736849784851074, + "learning_rate": 1.0763968562343963e-09, + "loss": 0.5964, + "step": 7755 + }, + { + "epoch": 0.99, + "grad_norm": 3.6767706871032715, + "learning_rate": 1.0337730096199316e-09, + "loss": 0.567, + "step": 7756 + }, + { + "epoch": 0.99, + "grad_norm": 1.1457759141921997, + "learning_rate": 9.920101333293907e-10, + "loss": 0.4591, + "step": 7757 + }, + { + "epoch": 0.99, + "grad_norm": 1.395374059677124, + "learning_rate": 9.511082345570189e-10, + "loss": 0.5131, + "step": 7758 + }, + { + "epoch": 0.99, + "grad_norm": 1.0349416732788086, + "learning_rate": 9.110673203471809e-10, + "loss": 0.7146, + "step": 7759 + }, + { + "epoch": 0.99, + "grad_norm": 1.1731163263320923, + "learning_rate": 8.718873975949171e-10, + "loss": 0.6224, + "step": 7760 + }, + { + "epoch": 0.99, + "grad_norm": 1.4639376401901245, + "learning_rate": 8.335684730492733e-10, + "loss": 0.6381, + "step": 7761 + }, + { + "epoch": 0.99, + "grad_norm": 1.3412548303604126, + "learning_rate": 7.9611055330997e-10, + "loss": 0.5763, + "step": 7762 + }, + { + "epoch": 0.99, + "grad_norm": 1.3731943368911743, + "learning_rate": 7.595136448274032e-10, + "loss": 0.5907, + "step": 7763 + }, + { + "epoch": 0.99, + "grad_norm": 1.477068305015564, + "learning_rate": 7.237777539059743e-10, + "loss": 0.6016, + "step": 7764 + }, + { + "epoch": 0.99, + "grad_norm": 1.240449070930481, + "learning_rate": 6.889028866990943e-10, + "loss": 0.5382, + "step": 7765 + }, + { + "epoch": 0.99, + "grad_norm": 1.313112735748291, + "learning_rate": 6.548890492141802e-10, + "loss": 0.5249, + "step": 7766 + }, + { + "epoch": 1.0, + "grad_norm": 1.2798494100570679, + "learning_rate": 6.217362473093235e-10, + "loss": 0.6216, + "step": 7767 + }, + { + "epoch": 1.0, + "grad_norm": 2.7569220066070557, + "learning_rate": 5.894444866938465e-10, + "loss": 0.602, + "step": 7768 + }, + { + "epoch": 1.0, + "grad_norm": 1.4768586158752441, + "learning_rate": 5.580137729299661e-10, + "loss": 0.6307, + "step": 7769 + }, + { + "epoch": 1.0, + "grad_norm": 1.2594913244247437, + "learning_rate": 5.274441114305751e-10, + "loss": 0.562, + "step": 7770 + }, + { + "epoch": 1.0, + "grad_norm": 1.1919431686401367, + "learning_rate": 4.977355074609059e-10, + "loss": 0.5843, + "step": 7771 + }, + { + "epoch": 1.0, + "grad_norm": 1.5836091041564941, + "learning_rate": 4.688879661379764e-10, + "loss": 0.601, + "step": 7772 + }, + { + "epoch": 1.0, + "grad_norm": 1.3357963562011719, + "learning_rate": 4.409014924294797e-10, + "loss": 0.5608, + "step": 7773 + }, + { + "epoch": 1.0, + "grad_norm": 1.4678525924682617, + "learning_rate": 4.1377609115600403e-10, + "loss": 0.615, + "step": 7774 + }, + { + "epoch": 1.0, + "grad_norm": 1.2490729093551636, + "learning_rate": 3.8751176698936797e-10, + "loss": 0.5467, + "step": 7775 + }, + { + "epoch": 1.0, + "grad_norm": 1.4261091947555542, + "learning_rate": 3.621085244526201e-10, + "loss": 0.5542, + "step": 7776 + }, + { + "epoch": 1.0, + "grad_norm": 1.5678194761276245, + "learning_rate": 3.375663679211494e-10, + "loss": 0.5603, + "step": 7777 + }, + { + "epoch": 1.0, + "grad_norm": 1.3209941387176514, + "learning_rate": 3.138853016221299e-10, + "loss": 0.5321, + "step": 7778 + }, + { + "epoch": 1.0, + "grad_norm": 1.2347135543823242, + "learning_rate": 2.9106532963396605e-10, + "loss": 0.7119, + "step": 7779 + }, + { + "epoch": 1.0, + "grad_norm": 1.4404094219207764, + "learning_rate": 2.6910645588684726e-10, + "loss": 0.5572, + "step": 7780 + }, + { + "epoch": 1.0, + "grad_norm": 1.1381596326828003, + "learning_rate": 2.480086841633034e-10, + "loss": 0.6961, + "step": 7781 + }, + { + "epoch": 1.0, + "grad_norm": 1.4806857109069824, + "learning_rate": 2.2777201809598415e-10, + "loss": 0.6237, + "step": 7782 + }, + { + "epoch": 1.0, + "grad_norm": 1.4877007007598877, + "learning_rate": 2.0839646117098988e-10, + "loss": 0.5864, + "step": 7783 + }, + { + "epoch": 1.0, + "grad_norm": 1.6828969717025757, + "learning_rate": 1.8988201672509587e-10, + "loss": 0.6237, + "step": 7784 + }, + { + "epoch": 1.0, + "grad_norm": 1.460888385772705, + "learning_rate": 1.7222868794686264e-10, + "loss": 0.5588, + "step": 7785 + }, + { + "epoch": 1.0, + "grad_norm": 1.2123541831970215, + "learning_rate": 1.5543647787719108e-10, + "loss": 0.5466, + "step": 7786 + }, + { + "epoch": 1.0, + "grad_norm": 1.8442035913467407, + "learning_rate": 1.39505389407657e-10, + "loss": 0.5387, + "step": 7787 + }, + { + "epoch": 1.0, + "grad_norm": 1.310922384262085, + "learning_rate": 1.2443542528273178e-10, + "loss": 0.5123, + "step": 7788 + }, + { + "epoch": 1.0, + "grad_norm": 1.1582748889923096, + "learning_rate": 1.102265880975617e-10, + "loss": 0.6051, + "step": 7789 + }, + { + "epoch": 1.0, + "grad_norm": 1.2169848680496216, + "learning_rate": 9.687888029907832e-11, + "loss": 0.5885, + "step": 7790 + }, + { + "epoch": 1.0, + "grad_norm": 1.2619993686676025, + "learning_rate": 8.439230418655353e-11, + "loss": 0.5656, + "step": 7791 + }, + { + "epoch": 1.0, + "grad_norm": 1.3111398220062256, + "learning_rate": 7.27668619099342e-11, + "loss": 0.5407, + "step": 7792 + }, + { + "epoch": 1.0, + "grad_norm": 1.3473933935165405, + "learning_rate": 6.20025554726178e-11, + "loss": 0.575, + "step": 7793 + }, + { + "epoch": 1.0, + "grad_norm": 1.3455848693847656, + "learning_rate": 5.2099386727566535e-11, + "loss": 0.547, + "step": 7794 + }, + { + "epoch": 1.0, + "grad_norm": 1.5748672485351562, + "learning_rate": 4.305735738008299e-11, + "loss": 0.5743, + "step": 7795 + }, + { + "epoch": 1.0, + "grad_norm": 1.4658571481704712, + "learning_rate": 3.4876468988920275e-11, + "loss": 0.5293, + "step": 7796 + }, + { + "epoch": 1.0, + "grad_norm": 1.108062505722046, + "learning_rate": 2.7556722961841197e-11, + "loss": 0.6937, + "step": 7797 + }, + { + "epoch": 1.0, + "grad_norm": 1.1840686798095703, + "learning_rate": 2.1098120559503998e-11, + "loss": 0.476, + "step": 7798 + }, + { + "epoch": 1.0, + "grad_norm": 1.324550747871399, + "learning_rate": 1.550066289490726e-11, + "loss": 0.5883, + "step": 7799 + }, + { + "epoch": 1.0, + "grad_norm": 1.7365845441818237, + "learning_rate": 1.0764350932279676e-11, + "loss": 0.5685, + "step": 7800 + }, + { + "epoch": 1.0, + "grad_norm": 1.3814128637313843, + "learning_rate": 6.889185486524952e-12, + "loss": 0.5706, + "step": 7801 + }, + { + "epoch": 1.0, + "grad_norm": 1.1768162250518799, + "learning_rate": 3.875167225442234e-12, + "loss": 0.5444, + "step": 7802 + }, + { + "epoch": 1.0, + "grad_norm": 1.977554440498352, + "learning_rate": 1.7222966680607856e-12, + "loss": 0.5997, + "step": 7803 + }, + { + "epoch": 1.0, + "grad_norm": 1.1297414302825928, + "learning_rate": 4.3057418575020993e-13, + "loss": 0.6249, + "step": 7804 + }, + { + "epoch": 1.0, + "grad_norm": 1.294604778289795, + "learning_rate": 0.0, + "loss": 0.5606, + "step": 7805 + }, + { + "epoch": 1.0, + "step": 7805, + "total_flos": 1.1237568065308721e+19, + "train_loss": 0.6154153387490028, + "train_runtime": 23859.4346, + "train_samples_per_second": 41.876, + "train_steps_per_second": 0.327 + } + ], + "logging_steps": 1.0, + "max_steps": 7805, + "num_input_tokens_seen": 0, + "num_train_epochs": 1, + "save_steps": 1000, + "total_flos": 1.1237568065308721e+19, + "train_batch_size": 1, + "trial_name": null, + "trial_params": null +}